示例#1
0
文件: Program.cs 项目: rblenis/cudafy
        // It's interesting to change the number of blocks and the number of threads to
        // understand how to keep the hardware busy.
        //
        // Here are some numbers I get on my G80:
        //    blocks - clocks
        //    1 - 3096
        //    8 - 3232
        //    16 - 3364
        //    32 - 4615
        //    64 - 9981
        //
        // With less than 16 blocks some of the multiprocessors of the device are idle. With
        // more than 16 you are using all the multiprocessors, but there's only one block per
        // multiprocessor and that doesn't allow you to hide the latency of the memory. With
        // more than 32 the speed scales linearly.
        static void Main(string[] args)
        {
            // Init CUDA, select 1st device.
            CUDA cuda = new CUDA(0, true);

            // load module
            cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "clock_kernel.cubin"));
            CUfunction func = cuda.GetModuleFunction("timedReduction");

            int[]   timer = new int[NUM_BLOCKS * 2];
            float[] input = new float[NUM_THREADS * 2];

            for (int i = 0; i < NUM_THREADS * 2; i++)
            {
                input[i] = (float)i;
            }

            CUdeviceptr dinput  = cuda.CopyHostToDevice <float>(input);
            CUdeviceptr doutput = cuda.Allocate((uint)(sizeof(float) * NUM_BLOCKS));
            CUdeviceptr dtimer  = cuda.Allocate <int>(timer);

            cuda.SetParameter(func, 0, (uint)dinput.Pointer);
            cuda.SetParameter(func, IntPtr.Size, (uint)doutput.Pointer);
            cuda.SetParameter(func, IntPtr.Size * 2, (uint)dtimer.Pointer);
            cuda.SetParameterSize(func, (uint)(IntPtr.Size * 3));

            //timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);
            cuda.SetFunctionBlockShape(func, NUM_THREADS, 1, 1);
            cuda.SetFunctionSharedSize(func, (uint)(sizeof(float) * 2 * NUM_THREADS));
            cuda.Launch(func, NUM_BLOCKS, 1);

            cuda.CopyDeviceToHost <int>(dtimer, timer);

            cuda.Free(dinput);
            cuda.Free(doutput);
            cuda.Free(dtimer);

            foreach (int i in timer)
            {
                Console.WriteLine(i);
            }

            Console.WriteLine("Test PASSED");

            int minStart = timer[0];
            int maxEnd   = timer[NUM_BLOCKS];

            for (int i = 1; i < NUM_BLOCKS; i++)
            {
                minStart = timer[i] < minStart ? timer[i] : minStart;
                maxEnd   = timer[NUM_BLOCKS + i] > maxEnd ? timer[NUM_BLOCKS + i] : maxEnd;
            }

            Console.WriteLine("time = {0}", maxEnd - minStart);
        }
示例#2
0
        private static int[] CalculateElementsFrequencies <T>(IList <T> elements, BitmapWrapper bitmapWrapper)
        {
            int frequencesSize      = ProjectConstants.BlockSize * ProjectConstants.GridSize;
            var elementsFrequencies = new int[frequencesSize];

            using (var cuda = new CUDA(0, true))
            {
                var path = Path.Combine(Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location),
                                        "apriori_count1.cubin");

                cuda.LoadModule(path);

                var inputData    = cuda.CopyHostToDevice(bitmapWrapper.RgbValues);
                var inputSetData = cuda.CopyHostToDevice(elements.ToArray());

                var answer = cuda.Allocate(new int[frequencesSize]);


                CallTheFrequencyCount(cuda, inputData, inputSetData, answer, bitmapWrapper, elements.Count);
                cuda.CopyDeviceToHost(answer, elementsFrequencies);

                cuda.Free(inputData);
                cuda.Free(answer);
                cuda.UnloadModule();
            }
            return(elementsFrequencies);
        }
示例#3
0
        private void SetCudaData()
        {
            CudaHelpers.GetNumThreadsAndBlocks(problemSize, maxReductionBlocks, threadsPerBlock, ref reductionThreads, ref reductionBlocks);

            alphaPtr = cuda.CopyHostToDevice(alpha);
            gradPtr  = cuda.CopyHostToDevice(G);
            yPtr     = cuda.CopyHostToDevice(y);


            //kernel columns i,j is simpler to copy array of zeros

            uint memSize = (uint)(sizeof(float) * problemSize * 2);

            kiPtr = cuda.Allocate(memSize);
            kjPtr = kiPtr + sizeof(float) * problemSize;

            //todo:remove it
            int redSize = reductionThreads; //reductionBlocks

            reduceVal = new float[redSize * 2];
            reduceIdx = new int[redSize * 2];


            valRedPtr = cuda.CopyHostToDevice(reduceVal);
            idxRedPtr = cuda.CopyHostToDevice(reduceIdx);


            constCPtr = cuda.GetModuleGlobal(cuModule, "C");
            float[] cData = new float[] { C };
            cuda.CopyHostToDevice(constCPtr, cData);

            constBPtr = cuda.GetModuleGlobal(cuModule, "B");
            B         = new float[] { 0, 0, C };
            cuda.CopyHostToDevice(constBPtr, B);

            constAPtr = cuda.GetModuleGlobal(cuModule, "A");
            A         = new float[] { -C, 0, 0 };
            cuda.CopyHostToDevice(constAPtr, A);


            SetCudaParams();
        }
示例#4
0
        protected void SetCudaData()
        {
            float[] svLabels = new float[sizeSV];
            float[] svAlphas = new float[sizeSV];
            Parallel.For(0, sizeSV,
                         i =>
            {
                int idx = TrainedModel.SupportElementsIndexes[i];

                svLabels[i] = TrainedModel.Y[i];
                //svLabels[i] = TrainningProblem.Labels[idx];
                svAlphas[i] = TrainedModel.Alpha[idx];
            });

            labelsPtr = cuda.CopyHostToDevice(svLabels);
            alphasPtr = cuda.CopyHostToDevice(svAlphas);


            vectorsDimMemSize = (uint)((TrainedModel.SupportElements[0].Dim + 1) * sizeof(float));
            for (int i = 0; i < NUM_STREAMS; i++)
            {
                stream[i] = cuda.CreateStream();

                //allocates memory for one vector, size = vector dim
                mainVecIntPtrs[i] = cuda.AllocateHost(vectorsDimMemSize);
                mainVecCuPtr[i]   = cuda.CopyHostToDevice(mainVecIntPtrs[i], vectorsDimMemSize);

                //allocate memory for output, size == #SV
                evalOutputCuPtr[i] = cuda.Allocate(svAlphas);

                cuVecTexRef[i] = cuda.GetModuleTexture(cuModule, cudaVecTexRefName[i]);
                //cuda.SetTextureFlags(cuVecTexRef[i], 0);
                cuda.SetTextureAddress(cuVecTexRef[i], mainVecCuPtr[i], vectorsDimMemSize);

                uint reduceMemSize = (uint)maxReductionBlocks * sizeof(float);
                reduceIntPtrs[i] = cuda.AllocateHost(reduceMemSize);
                reduceCuPtr[i]   = cuda.CopyHostToDevice(reduceIntPtrs[i], reduceMemSize);
                //reduceCuPtr[i] = cuda.Allocate((uint)maxReductionBlocks * sizeof(float));
            }
        }
示例#5
0
        private int[] CalculateElementsFrequencies(BitmapWrapper bitmapWrapper)
        {
            var elementsFrequencies = new int[bitmapWrapper.Width];

            using (var cuda = new CUDA(0, true))
            {
                var path = Path.Combine(Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location),
                                        Names.CudaAprioriCountModule);

                cuda.LoadModule(path);

                var inputData = cuda.CopyHostToDevice(bitmapWrapper.RgbValues);
                var answer    = cuda.Allocate(new int[bitmapWrapper.Width]);
                CallTheFrequencyCount(cuda, inputData, answer, bitmapWrapper);
                cuda.CopyDeviceToHost(answer, elementsFrequencies);

                cuda.Free(inputData);
                cuda.Free(answer);
                cuda.UnloadModule();
            }
            return(elementsFrequencies);
        }
示例#6
0
        private static int[] CalculateCandidatesFrequencies <T>(IList <List <int> > candidates, BitmapWrapper bitmapWrapper, double borderValue)
        {
            using (var cuda = new CUDA(0, true))
            {
                var elementsFrequencies = new int[candidates.Count];

                var inputSets = new int[candidates.Count * candidates[0].Count];
                var ind       = 0;
                foreach (var candidate in candidates)
                {
                    foreach (var i in candidate)
                    {
                        inputSets[ind] = i;
                        ind++;
                    }
                }

                var path = Path.Combine(Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location),
                                        Names.CudaAprioriCountModule);
                cuda.LoadModule(path);
                var inputData         = cuda.CopyHostToDevice(bitmapWrapper.RgbValues);
                var inputSetData      = cuda.CopyHostToDevice(inputSets);
                var frequenciesOnHost = cuda.Allocate(new int[candidates.Count]);
                var sw = new Stopwatch();
                sw.Start();
                CallTheSetsFrequenciesCount(cuda, inputData, inputSetData, frequenciesOnHost, bitmapWrapper, candidates[0].Count, candidates.Count, borderValue);
                sw.Stop();

                //Console.WriteLine("CalculateCandidatesFrequencies: {0} ms", sw.ElapsedMilliseconds);
                cuda.CopyDeviceToHost(frequenciesOnHost, elementsFrequencies);

                cuda.Free(inputData);
                cuda.Free(inputSetData);
                cuda.Free(frequenciesOnHost);
                cuda.UnloadModule();

                return(elementsFrequencies);
            }
        }
示例#7
0
        static void Main(string[] args)
        {
            // Init and select 1st device.
            CUDA cuda = new CUDA(0, true);

            // load module
            //cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "transpose_kernel.cubin"));
            cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "transpose_kernel.ptx"));
            CUfunction transpose = cuda.GetModuleFunction("transpose");
            CUfunction transpose_naive = cuda.GetModuleFunction("transpose_naive");

            const int size_x = 4096;
            const int size_y = 4096;
            const int mem_size = sizeof(float) * size_x * size_y;

            float[] h_idata = new float[size_x * size_y];
            for (int i = 0; i < h_idata.Length; i++)
            {
                h_idata[i] = (float)i;
            }

            // allocate device memory
            // copy host memory to device
            CUdeviceptr d_idata = cuda.CopyHostToDevice<float>(h_idata);
            CUdeviceptr d_odata = cuda.Allocate<float>(h_idata);

            // setup execution parameters
            cuda.SetFunctionBlockShape(transpose_naive, BLOCK_DIM, BLOCK_DIM, 1);
            cuda.SetParameter(transpose_naive, 0, (uint)d_odata.Pointer);
            cuda.SetParameter(transpose_naive, IntPtr.Size, (uint)d_idata.Pointer);
            cuda.SetParameter(transpose_naive, IntPtr.Size * 2, (uint)size_x);
            cuda.SetParameter(transpose_naive, IntPtr.Size * 2 + 4, (uint)size_y);
            cuda.SetParameterSize(transpose_naive, (uint)(IntPtr.Size * 2 + 8));

            cuda.SetFunctionBlockShape(transpose, BLOCK_DIM, BLOCK_DIM, 1);
            cuda.SetParameter(transpose, 0, (uint)d_odata.Pointer);
            cuda.SetParameter(transpose, IntPtr.Size, (uint)d_idata.Pointer);
            cuda.SetParameter(transpose, IntPtr.Size * 2, (uint)size_x);
            cuda.SetParameter(transpose, IntPtr.Size * 2 + 4, (uint)size_y);
            cuda.SetParameterSize(transpose, (uint)(IntPtr.Size * 2 + 8));

            // warmup so we don't time CUDA startup
            cuda.Launch(transpose_naive, size_x / BLOCK_DIM, size_y / BLOCK_DIM);
            cuda.Launch(transpose, size_x / BLOCK_DIM, size_y / BLOCK_DIM);
            //System.Threading.Thread.Sleep(10);
            int numIterations = 100;

            Console.WriteLine("Transposing a {0} by {1} matrix of floats...", size_x, size_y);
            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();
            cuda.RecordEvent(start);
            for (int i = 0; i < numIterations; i++)
            {
                cuda.Launch(transpose_naive, size_x / BLOCK_DIM, size_y / BLOCK_DIM);
            }
            cuda.SynchronizeContext();
            cuda.RecordEvent(end);
            cuda.SynchronizeContext();
            float naiveTime = cuda.ElapsedTime(start, end);
            Console.WriteLine("Naive transpose average time:     {0} ms\n", naiveTime / numIterations);

            cuda.RecordEvent(start);
            for (int i = 0; i < numIterations; i++)
            {
                cuda.Launch(transpose, size_x / BLOCK_DIM, size_y / BLOCK_DIM);
            }
            cuda.SynchronizeContext();
            cuda.RecordEvent(end);
            cuda.SynchronizeContext();
            float optimizedTime = cuda.ElapsedTime(start, end);

            
            Console.WriteLine("Optimized transpose average time:     {0} ms\n", optimizedTime / numIterations);

            float[] h_odata = new float[size_x * size_y];
            cuda.CopyDeviceToHost<float>(d_odata, h_odata);

            float[] reference = new float[size_x * size_y];
            computeGold(reference, h_idata, size_x, size_y);

            bool res = CompareF(reference, h_odata, size_x * size_y);
            Console.WriteLine("Test {0}", res == true? "PASSED":"FAILED");

            cuda.Free(d_idata);
            cuda.Free(d_odata);

            Console.ReadKey();
        }
示例#8
0
		static private void Worker(object cState)
		{
			try
			{
				Command cCmd;
				CUDA cCUDA = new CUDA(true);
				for (int i = 0; i < 10; i++)
				{
					try
					{
						cCUDA.CreateContext(i);
						(new Logger()).WriteDebug2(i + ": success");
						break;
					}
					catch (Exception ex)
					{
                        (new Logger()).WriteDebug2(i + ": failed");
                        if (Logger.bDebug && Logger.Level.debug3 > Logger.eLevelMinimum)
                            (new Logger()).WriteError(ex);
                    }
				}
				uint nMemoryReservedForMerge = 2 * 1024 * 1024; //PREFERENCES типа <memory reserved="2097152" />
				uint nMemoryStarvationThreshold = cCUDA.TotalMemory / 2; //PREFERENCES через проценты... типа <memory starvation="50%" />
				uint nMemoryFree;
                string sModule = "CUDAFunctions_" + Preferences.nCUDAVersion + "_x" + (IntPtr.Size * 8);
                if (Logger.bDebug)
                    (new Logger()).WriteDebug3(sModule);
                cCUDA.LoadModule((byte[])Properties.Resource.ResourceManager.GetObject(sModule)); //   $(ProjectDir)Resources\CUDAFunctions.cubin
				//cCUDA.LoadModule(@"c:\projects\!helpers\video\PixelsMap\Resources\CUDAFunctions.cubin");
				CUfunction cCUDAFuncMerge = cCUDA.GetModuleFunction("CUDAFrameMerge");
				int nThreadsPerBlock = 256; //пришлось уменьшить с 512 до 256 сридов на блок, потому что при добавлении "движения" и операций с float, ловил ошибку: Too Many Resources Requested for Launch (This error means that the number of registers available on the multiprocessor is being exceeded. Reduce the number of threads per block to solve the problem)
				cCUDA.SetFunctionBlockShape(cCUDAFuncMerge, nThreadsPerBlock, 1, 1);
				CUDADriver.cuParamSetSize(cCUDAFuncMerge, 8);

				Dictionary<ulong, CUdeviceptr> ahDevicePointers = new Dictionary<ulong, CUdeviceptr>();
				CUdeviceptr cPMs;
				CUdeviceptr cInfos;
				CUdeviceptr cAlphaMap;
				{
					//IntPtr[] aPointersByAlpha = new IntPtr[254];  //те самые поинтеры-альфы. Ссылаются на массивы поинтеров B, т.е. BackGrounds
					//IntPtr[] aPointersByBackground = new IntPtr[256];   //  те самые массивы поинтеров B, т.е. BackGrounds
					byte[] aAlphaMap = new byte[16646144];
					int nResult, nIndx = 0;
					for (byte nAlpha = 1; 255 > nAlpha; nAlpha++)
					{
						for (ushort nBackground = 0; 256 > nBackground; nBackground++)
						{
							for (ushort nForeground = 0; 256 > nForeground; nForeground++)
							{
								if (255 < (nResult = (int)((float)(nAlpha * (nForeground - nBackground)) / 255 + nBackground + 0.5)))
									nResult = 255;
								aAlphaMap[nIndx++] = (byte)nResult;
							}
							//aPointersByBackground[nBackground] = (IntPtr)cCUDA.CopyHostToDevice<byte>(aResults).Pointer;
						}
						//aPointersByAlpha[nAlpha - 1] = (IntPtr)cCUDA.CopyHostToDevice<IntPtr>(aPointersByBackground).Pointer;
					}
					cAlphaMap = cCUDA.CopyHostToDevice<byte>(aAlphaMap);
				}
				//{
				//    IntPtr[] aPointersByAlpha = new IntPtr[254];  //те самые поинтеры-альфы. Ссылаются на массивы поинтеров B, т.е. BackGrounds
				//    IntPtr[] aPointersByBackground = new IntPtr[256];   //  те самые массивы поинтеров B, т.е. BackGrounds
				//    byte[] aResults = new byte[256];
				//    int nResult;
				//    for (byte nAlpha = 1; 255 > nAlpha; nAlpha++)
				//    {
				//        for (ushort nBackground = 0; 256 > nBackground; nBackground++)
				//        {
				//            for (ushort nForeground = 0; 256 > nForeground; nForeground++)
				//            {
				//                if (255 < (nResult = (int)((float)(nAlpha * (nForeground - nBackground)) / 255 + nBackground + 0.5)))
				//                    nResult = 255;
				//                aResults[nForeground] = (byte)nResult;
				//            }
				//            aPointersByBackground[nBackground] = (IntPtr)cCUDA.CopyHostToDevice<byte>(aResults).Pointer;
				//        }
				//        aPointersByAlpha[nAlpha - 1] = (IntPtr)cCUDA.CopyHostToDevice<IntPtr>(aPointersByBackground).Pointer;
				//    }
				//    cAlphaMap = cCUDA.CopyHostToDevice<IntPtr>(aPointersByAlpha);
				//}

#if DEBUG
				Dictionary<ulong, DateTime> ahDebug = new Dictionary<ulong,DateTime>();
#endif
				DateTime dtNextTime = DateTime.MinValue, dtNow;
				long nStartTick; // logging
				while (true)
				{
					if (1 > _aqCommands.CountGet() && (dtNow = DateTime.Now) > dtNextTime)
					{
						dtNextTime = dtNow.AddSeconds(60);
#if DEBUG
						dtNow = dtNow.Subtract(TimeSpan.FromHours(2));
						string sMessage = "";
						foreach (ulong nID in ahDebug.Keys)
							if (dtNow > ahDebug[nID])
								sMessage += "<br>[" + nID + ":" + ahDebug[nID].ToString("HH:mm:ss") + "]";
#endif
						(new Logger()).WriteDebug("CUDA free memory:" + cCUDA.FreeMemory
#if DEBUG
							+ "; possibly timeworn allocations:" + (1 > sMessage.Length ? "no" : sMessage)
#endif
						);
					}
					while (true)
					{
						try
						{
							cCmd = _aqCommands.Dequeue();  //если нечего отдать - заснёт
							break;
						}
						catch (Exception ex)
						{
							(new Logger()).WriteError(ex);
						}
					}
					_CommandsCount = _aqCommands.nCount;
					switch (cCmd.eID)
					{
						case Command.ID.Allocate:
							#region
							try
							{
								cCmd.cPM._cException = null;
								if (1 > cCmd.cPM._nID)
								{
									if (0 < cCmd.cPM._nBytesQty)
									{
										nMemoryFree = cCUDA.FreeMemory;
										if (nMemoryReservedForMerge < nMemoryFree - cCmd.cPM._nBytesQty)
										{
											bMemoryStarvation = (nMemoryFree < nMemoryStarvationThreshold);
											cCmd.cPM._nID = _nCurrentID++;
											ahDevicePointers.Add(cCmd.cPM._nID, cCUDA.Allocate(cCmd.cPM._nBytesQty));
#if DEBUG
											ahDebug.Add(cCmd.cPM._nID, DateTime.Now);
#endif
										}
										else
										{
											bMemoryStarvation = true;
											throw new Exception("out of memory in CUDA device during Allocate. Only 2 MBytes reserved for the Merge");
										}
									}
									else
										throw new Exception("bytes quantity in PixelsMap have to be greater than zero for Allocate [_bDisposed = " + cCmd.cPM._bDisposed + "][_bProcessing = " + cCmd.cPM._bProcessing + "][_bShiftVertical = " + cCmd.cPM._bShiftVertical + "][_bTemp = " + cCmd.cPM._bTemp + "][_dt = " + cCmd.cPM._dt + "][_nBytesQty = " + cCmd.cPM._nBytesQty + "][_nID = " + cCmd.cPM._nID + "][_nShiftPosition = " + cCmd.cPM._nShiftPosition + "][_stArea.nHeight = " + cCmd.cPM._stArea.nHeight + "][_stArea.nWidth = " + cCmd.cPM._stArea.nWidth + "][bKeepAlive = " + cCmd.cPM.bKeepAlive + "][bBackgroundClear = " + cCmd.cPM.bBackgroundClear + "][eAlpha = " + cCmd.cPM.eAlpha + "][bCUDA = " + cCmd.cPM.bCUDA + "][nAlphaConstant = " + cCmd.cPM.nAlphaConstant + "][nID = " + cCmd.cPM.nID + "][nLength = " + cCmd.cPM.nLength + "][stArea.nHeight = " + cCmd.cPM.stArea.nHeight + "][stArea.nWidth = " + cCmd.cPM.stArea.nWidth + "]");
								}
								else
									throw new Exception("PixelsMap ID have to be zero for Allocate");
							}
							catch (Exception ex)
							{
								if (ex is CUDAException)
									ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
								(new Logger()).WriteError(ex);
								(new Logger()).WriteDebug("bytes qty:" + cCmd.cPM._nBytesQty);
								cCmd.cPM._cException = ex;
							}
							cCmd.cMRE.Set();
							break;
							#endregion
						case Command.ID.CopyIn:
							#region
							nStartTick = DateTime.Now.Ticks; // logging
							try
							{
								cCmd.cPM._cException = null;
								if (1 > cCmd.cPM._nID)
								{
									if (cCUDA.FreeMemory - cCmd.cPM._nBytesQty > nMemoryReservedForMerge)
									{
										cCmd.cPM._nID = _nCurrentID++;
										if (cCmd.ahParameters.ContainsKey(typeof(IntPtr)))
											ahDevicePointers.Add(cCmd.cPM._nID, cCUDA.CopyHostToDevice((IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty));
										else if (cCmd.ahParameters.ContainsKey(typeof(byte[])))
											ahDevicePointers.Add(cCmd.cPM._nID, cCUDA.CopyHostToDevice((byte[])cCmd.ahParameters[typeof(byte[])]));
										else
											throw new Exception("unknown parameter type");
#if DEBUG
											ahDebug.Add(cCmd.cPM._nID, DateTime.Now);
#endif
									}
									else
										throw new Exception("out of memory in CUDA device during CopyIn. Only 2 MBytes reserved for the Merge.");
								}
								else
								{
									if (cCmd.ahParameters.ContainsKey(typeof(IntPtr)))
										cCUDA.CopyHostToDevice(ahDevicePointers[cCmd.cPM._nID], (IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty);
									else if (cCmd.ahParameters.ContainsKey(typeof(byte[])))
										cCUDA.CopyHostToDevice(ahDevicePointers[cCmd.cPM._nID], (byte[])cCmd.ahParameters[typeof(byte[])]);
									else
										throw new Exception("unknown parameter type");
								}
								if (ahDevicePointers.ContainsKey(cCmd.cPM._nID))
									(new Logger()).WriteDebug5("copy in [id:" + cCmd.cPM._nID + "][ptr:" + ahDevicePointers[cCmd.cPM._nID].Pointer + "]");
								else
									(new Logger()).WriteDebug5("copy in [id:" + cCmd.cPM._nID + "][ptr: not in dictionary]");
							}
							catch (Exception ex)
							{
								if (ex is CUDAException)
									ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
								(new Logger()).WriteError(ex);
								cCmd.cPM._cException = ex;
							}
							if (new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds >= 20)    // logging
								(new Logger()).WriteNotice("PixelMap: Command.ID.CopyIn: execution time > 20ms: " + new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds +"ms");    // logging
							cCmd.cMRE.Set();
							break;
							#endregion
						case Command.ID.CopyOut:
							#region
							nStartTick = DateTime.Now.Ticks; // logging
							try
							{
								if (0 < cCmd.cPM._nID)
								{
									if(!cCmd.ahParameters.ContainsKey(typeof(IntPtr)))
									{
										if(cCmd.ahParameters.ContainsKey(typeof(byte[])))
										{
											cCmd.cPM._aBytes = (byte[])cCmd.ahParameters[typeof(byte[])];
											if(cCmd.cPM._nBytesQty != cCmd.cPM._aBytes.Length)
												(new Logger()).WriteWarning("wrong array size for copyout [got:" + cCmd.cPM._aBytes.Length + "][expected:" + cCmd.cPM._nBytesQty + "]");
										}
										else
											cCmd.cPM._aBytes = new byte[cCmd.cPM._nBytesQty];
										cCUDA.CopyDeviceToHost<byte>(ahDevicePointers[cCmd.cPM._nID], cCmd.cPM._aBytes);
									}
									else 
										cCUDA.CopyDeviceToHost(ahDevicePointers[cCmd.cPM._nID], (IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty);
									(new Logger()).WriteDebug5("copy out [id:" + cCmd.cPM._nID + "][ptr:" + ahDevicePointers[cCmd.cPM._nID].Pointer + "]");
								}
								else
									throw new Exception("PixelsMap have to be allocated for CopyOut");
							}
							catch (Exception ex)
							{
								if (ex is CUDAException)
									ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
								(new Logger()).WriteError(ex);
								cCmd.cPM._cException = ex;
							}
							if (new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds >= 20)    // logging
								(new Logger()).WriteNotice("PixelMap: Command.ID.CopyOut: execution time > 20ms: " + new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds +"ms");    // logging
							cCmd.cMRE.Set();
							break;
							#endregion
						case Command.ID.Merge:
							#region
							try
							{
								List<PixelsMap> aPMs = (List<PixelsMap>)cCmd.ahParameters[typeof(List<PixelsMap>)];
								DisCom.MergeInfo cMergeInfo = (DisCom.MergeInfo)cCmd.ahParameters[typeof(DisCom.MergeInfo)];
								List<IntPtr> aDPs = new List<IntPtr>();

								if (1 > cCmd.cPM._nID)
									throw new Exception("background PixelsMap have to be allocated for Merge");

								aDPs.Add((IntPtr)ahDevicePointers[cCmd.cPM._nID].Pointer);
								for (int nIndx = 0; nIndx < aPMs.Count; nIndx++)
								{
									if (!ahDevicePointers.ContainsKey(aPMs[nIndx]._nID))
										throw new Exception("there is a corrupted ID in layers for merge [id:" + aPMs[nIndx]._nID + "]");
									if (1 > ahDevicePointers[aPMs[nIndx]._nID].Pointer)
										throw new Exception("there is an empty pointer in layers for merge [id:" + aPMs[nIndx]._nID + "]");
									aDPs.Add((IntPtr)ahDevicePointers[aPMs[nIndx]._nID].Pointer);
								}

								cPMs = cCUDA.CopyHostToDevice<IntPtr>(aDPs.ToArray());
								cInfos = cCUDA.CopyHostToDevice(cMergeInfo, cMergeInfo.SizeGet());

								cCUDA.SetParameter<IntPtr>(cCUDAFuncMerge, 0, (IntPtr)cPMs.Pointer);
								cCUDA.SetParameter<IntPtr>(cCUDAFuncMerge, IntPtr.Size, (IntPtr)cInfos.Pointer);
								cCUDA.SetParameter<IntPtr>(cCUDAFuncMerge, IntPtr.Size * 2, (IntPtr)cAlphaMap.Pointer);
								cCUDA.SetParameterSize(cCUDAFuncMerge, (uint)(IntPtr.Size * 3));
								int nIterations = (0 == cMergeInfo.nBackgroundSize % nThreadsPerBlock ? cMergeInfo.nBackgroundSize / nThreadsPerBlock : cMergeInfo.nBackgroundSize / nThreadsPerBlock + 1);
								cCUDA.Launch(cCUDAFuncMerge, nIterations, 1);
								cCmd.cMRE.Set();

                                cMergeInfo.Dispose();

								cCUDA.Free(cPMs);
								cCUDA.Free(cInfos);
								for (int nIndx = 0; nIndx < aPMs.Count; nIndx++)
								{
									lock (aPMs[nIndx]._cSyncRoot)
										aPMs[nIndx]._bProcessing = false;
									aPMs[nIndx].Dispose();
								}
							}
							catch (Exception ex)
							{
								cCmd.cMRE.Set();
								if (ex is CUDAException)
									ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
								(new Logger()).WriteError(ex);
								cCmd.cPM._cException = ex;
							}
							break;
							#endregion
						case Command.ID.Dispose:
							#region
							nStartTick = DateTime.Now.Ticks; // logging
							(new Logger()).Write(Logger.Level.debug2, "dispose: in");
							try
							{
								if (ahDevicePointers.ContainsKey(cCmd.cPM._nID))
								{
									if (0 < cCmd.cPM._nID && 0 < ahDevicePointers[cCmd.cPM._nID].Pointer)
									{
										cCUDA.Free(ahDevicePointers[cCmd.cPM._nID]);
										//cCUDA.SynchronizeContext();
										bMemoryStarvation = (cCUDA.FreeMemory < nMemoryStarvationThreshold);
										(new Logger()).WriteDebug3("dispose [id:" + cCmd.cPM._nID + "][ptr:" + ahDevicePointers[cCmd.cPM._nID].Pointer + "]");
									}
									ahDevicePointers.Remove(cCmd.cPM._nID);
#if DEBUG
									ahDebug.Remove(cCmd.cPM._nID);
#endif
									cCmd.cPM._nID = 0;
								}
							}
							catch (Exception ex)
							{
								if (ex is CUDAException)
									ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
								(new Logger()).WriteError(ex);
								cCmd.cPM._cException = ex;
							}
							(new Logger()).Write(Logger.Level.debug2, "dispose: out");
							if (new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds >= 20)    // logging
								(new Logger()).WriteNotice("PixelMap: Command.ID.Dispose: execution time > 20ms: " + new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds +"ms");    // logging
							break;
							#endregion
					}
				}
			}
			catch (Exception ex)
			{
				(new Logger()).WriteError(ex);
			}
		}
		unsafe public FlaCudaTask(CUDA _cuda, int channelCount, int channels, uint bits_per_sample, int max_frame_size, bool do_verify)
		{
			cuda = _cuda;

			residualTasksLen = sizeof(FlaCudaSubframeTask) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FlaCudaWriter.maxFrames;
			bestResidualTasksLen = sizeof(FlaCudaSubframeTask) * channelCount * FlaCudaWriter.maxFrames;
			samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount;
			int partitionsLen = sizeof(int) * (30 << 8) * channelCount * FlaCudaWriter.maxFrames;
			int riceParamsLen = sizeof(int) * (4 << 8) * channelCount * FlaCudaWriter.maxFrames;
			int lpcDataLen = sizeof(float) * 32 * 33 * lpc.MAX_LPC_WINDOWS * channelCount * FlaCudaWriter.maxFrames;

			cudaSamplesBytes = cuda.Allocate((uint)samplesBufferLen / 2);
			cudaSamples = cuda.Allocate((uint)samplesBufferLen);
			cudaResidual = cuda.Allocate((uint)samplesBufferLen);
			cudaLPCData = cuda.Allocate((uint)lpcDataLen);
			cudaPartitions = cuda.Allocate((uint)partitionsLen);
			cudaRiceParams = cuda.Allocate((uint)riceParamsLen);
			cudaBestRiceParams = cuda.Allocate((uint)riceParamsLen / 4);
			cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FlaCudaWriter.maxAutocorParts + FlaCudaWriter.maxFrames)));
			cudaResidualTasks = cuda.Allocate((uint)residualTasksLen);
			cudaBestResidualTasks = cuda.Allocate((uint)bestResidualTasksLen);
			cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * 64 /*FlaCudaWriter.maxResidualParts*/ * FlaCudaWriter.maxFrames));
			CUResult cuErr = CUResult.Success;
			if (cuErr == CUResult.Success)
				cuErr = CUDADriver.cuMemAllocHost(ref samplesBytesPtr, (uint)samplesBufferLen/2);
			if (cuErr == CUResult.Success)
				cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen);
			if (cuErr == CUResult.Success)
				cuErr = CUDADriver.cuMemAllocHost(ref bestRiceParamsPtr, (uint)riceParamsLen / 4);
			if (cuErr == CUResult.Success)
				cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)residualTasksLen);
			if (cuErr == CUResult.Success)
				cuErr = CUDADriver.cuMemAllocHost(ref bestResidualTasksPtr, (uint)bestResidualTasksLen);
			if (cuErr != CUResult.Success)
			{
				if (samplesBytesPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBytesPtr); samplesBytesPtr = IntPtr.Zero;
				if (residualBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualBufferPtr); residualBufferPtr = IntPtr.Zero;
				if (bestRiceParamsPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestRiceParamsPtr); bestRiceParamsPtr = IntPtr.Zero;
				if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero;
				if (bestResidualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestResidualTasksPtr); bestResidualTasksPtr = IntPtr.Zero;
				throw new CUDAException(cuErr);
			}

			cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
			cudaStereoDecorr = cuda.GetModuleFunction("cudaStereoDecorr");
			cudaChannelDecorr = cuda.GetModuleFunction("cudaChannelDecorr");
			cudaChannelDecorr2 = cuda.GetModuleFunction("cudaChannelDecorr2");
			cudaFindWastedBits = cuda.GetModuleFunction("cudaFindWastedBits");
			cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
			cudaQuantizeLPC = cuda.GetModuleFunction("cudaQuantizeLPC");
			cudaComputeLPCLattice = cuda.GetModuleFunction("cudaComputeLPCLattice");
			cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
			cudaEstimateResidual8 = cuda.GetModuleFunction("cudaEstimateResidual8");
			cudaEstimateResidual12 = cuda.GetModuleFunction("cudaEstimateResidual12");
			cudaEstimateResidual1 = cuda.GetModuleFunction("cudaEstimateResidual1");
			cudaChooseBestMethod = cuda.GetModuleFunction("cudaChooseBestMethod");
			cudaCopyBestMethod = cuda.GetModuleFunction("cudaCopyBestMethod");
			cudaCopyBestMethodStereo = cuda.GetModuleFunction("cudaCopyBestMethodStereo");
			cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
			cudaCalcPartition = cuda.GetModuleFunction("cudaCalcPartition");
			cudaCalcPartition16 = cuda.GetModuleFunction("cudaCalcPartition16");
			cudaCalcLargePartition = cuda.GetModuleFunction("cudaCalcLargePartition");
			cudaSumPartition = cuda.GetModuleFunction("cudaSumPartition");
			cudaFindRiceParameter = cuda.GetModuleFunction("cudaFindRiceParameter");
			cudaFindPartitionOrder = cuda.GetModuleFunction("cudaFindPartitionOrder");

			stream = cuda.CreateStream();
			samplesBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channelCount];
			outputBuffer = new byte[max_frame_size * FlaCudaWriter.maxFrames + 1];
			frame = new FlacFrame(channelCount);
			frame.writer = new BitWriter(outputBuffer, 0, outputBuffer.Length);

			if (do_verify)
			{
				verify = new FlakeReader(new AudioPCMConfig((int)bits_per_sample, channels, 44100));
				verify.DoCRC = false;
			}
		}
示例#10
0
文件: Program.cs 项目: rblenis/cudafy
        static void Main(string[] args)
        {
            // Init and select 1st device.
            CUDA cuda = new CUDA(0, true);

            // load module
            //cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "transpose_kernel.cubin"));
            cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "transpose_kernel.ptx"));
            CUfunction transpose       = cuda.GetModuleFunction("transpose");
            CUfunction transpose_naive = cuda.GetModuleFunction("transpose_naive");

            const int size_x   = 4096;
            const int size_y   = 4096;
            const int mem_size = sizeof(float) * size_x * size_y;

            float[] h_idata = new float[size_x * size_y];
            for (int i = 0; i < h_idata.Length; i++)
            {
                h_idata[i] = (float)i;
            }

            // allocate device memory
            // copy host memory to device
            CUdeviceptr d_idata = cuda.CopyHostToDevice <float>(h_idata);
            CUdeviceptr d_odata = cuda.Allocate <float>(h_idata);

            // setup execution parameters
            cuda.SetFunctionBlockShape(transpose_naive, BLOCK_DIM, BLOCK_DIM, 1);
            cuda.SetParameter(transpose_naive, 0, (uint)d_odata.Pointer);
            cuda.SetParameter(transpose_naive, IntPtr.Size, (uint)d_idata.Pointer);
            cuda.SetParameter(transpose_naive, IntPtr.Size * 2, (uint)size_x);
            cuda.SetParameter(transpose_naive, IntPtr.Size * 2 + 4, (uint)size_y);
            cuda.SetParameterSize(transpose_naive, (uint)(IntPtr.Size * 2 + 8));

            cuda.SetFunctionBlockShape(transpose, BLOCK_DIM, BLOCK_DIM, 1);
            cuda.SetParameter(transpose, 0, (uint)d_odata.Pointer);
            cuda.SetParameter(transpose, IntPtr.Size, (uint)d_idata.Pointer);
            cuda.SetParameter(transpose, IntPtr.Size * 2, (uint)size_x);
            cuda.SetParameter(transpose, IntPtr.Size * 2 + 4, (uint)size_y);
            cuda.SetParameterSize(transpose, (uint)(IntPtr.Size * 2 + 8));

            // warmup so we don't time CUDA startup
            cuda.Launch(transpose_naive, size_x / BLOCK_DIM, size_y / BLOCK_DIM);
            cuda.Launch(transpose, size_x / BLOCK_DIM, size_y / BLOCK_DIM);
            //System.Threading.Thread.Sleep(10);
            int numIterations = 100;

            Console.WriteLine("Transposing a {0} by {1} matrix of floats...", size_x, size_y);
            CUevent start = cuda.CreateEvent();
            CUevent end   = cuda.CreateEvent();

            cuda.RecordEvent(start);
            for (int i = 0; i < numIterations; i++)
            {
                cuda.Launch(transpose_naive, size_x / BLOCK_DIM, size_y / BLOCK_DIM);
            }
            cuda.SynchronizeContext();
            cuda.RecordEvent(end);
            cuda.SynchronizeContext();
            float naiveTime = cuda.ElapsedTime(start, end);

            Console.WriteLine("Naive transpose average time:     {0} ms\n", naiveTime / numIterations);

            cuda.RecordEvent(start);
            for (int i = 0; i < numIterations; i++)
            {
                cuda.Launch(transpose, size_x / BLOCK_DIM, size_y / BLOCK_DIM);
            }
            cuda.SynchronizeContext();
            cuda.RecordEvent(end);
            cuda.SynchronizeContext();
            float optimizedTime = cuda.ElapsedTime(start, end);


            Console.WriteLine("Optimized transpose average time:     {0} ms\n", optimizedTime / numIterations);

            float[] h_odata = new float[size_x * size_y];
            cuda.CopyDeviceToHost <float>(d_odata, h_odata);

            float[] reference = new float[size_x * size_y];
            computeGold(reference, h_idata, size_x, size_y);

            bool res = CompareF(reference, h_odata, size_x * size_y);

            Console.WriteLine("Test {0}", res == true? "PASSED":"FAILED");

            cuda.Free(d_idata);
            cuda.Free(d_odata);

            Console.ReadKey();
        }
示例#11
0
        private static float[] CuDotProdEllPackTexCached()
        {
            //always the same values
            Random rnd = new Random(1);

            CUDA cuda = new CUDA(0, true);

            // load module
            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));

            CUfunction structPassFunc = cuda.GetModuleFunction("DotProdEllPackCached");

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("init arrays");
            Stopwatch t = Stopwatch.StartNew();
            float[] vecVals = new float[N * maxRowSize];
            int[] vecIdx = new int[N * maxRowSize];

            maxIndex = 0;
            for (int i = 0; i < N; i++)
            {
                int vecSize = avgElements + i % stdElements;

                float[] vals = Helpers.InitValues(i, vecSize, maxVal);

                //values are column-major aligment
                for (int z = 0; z < vals.Length; z++)
                {
                    int m = z * N + i;
                    vecVals[m] = vals[z];
                }

                //Array.Copy(vals,0,vecVals,i*maxRowSize,vals.Length);

                int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);
                //Array.Copy(index, 0, vecIdx, i * maxRowSize, index.Length);
                for (int z = 0; z < index.Length; z++)
                {
                    int m = z * N + i;
                    vecIdx[m] = index[z];
                }

            }

            float[] mainVec = new float[maxIndex + 1];

            for (int j = 0; j < maxRowSize; j++)
            {
                int idx = vecIdx[mainIndex + N * j];
                float val = vecVals[mainIndex + N * j];
                mainVec[idx] = val;
            }
            Console.WriteLine("Init takes {0}", t.Elapsed);
            t.Start();

            CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals);
            CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx);

            CUarray cuArr = cuda.CreateArray(mainVec);
            cuda.CopyHostToArray(cuArr, mainVec, 0);

            //CUDAArrayDescriptor cuDesc = new CUDAArrayDescriptor();
            //cuDesc.Format = CUArrayFormat.Float;
            //cuDesc.NumChannels = 1;
            //cuDesc.Width = maxIndex+1;

            CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef");
            cuda.SetTextureFlags(cuTexRef, 0);

            cuda.SetTextureArray(cuTexRef, cuArr);

            float[] output = new float[N];
            CUdeviceptr dOutput = cuda.Allocate(output);

            Console.WriteLine("copy to device takes {0}", t.Elapsed);

            cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1);

            int offset = 0;
            cuda.SetParameter(structPassFunc, offset, valsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(structPassFunc, offset, idxPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(structPassFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(structPassFunc, offset, (uint)maxRowSize);
            offset += sizeof(int);
            cuda.SetParameter(structPassFunc, offset, (uint)N);
            offset += sizeof(int);
            cuda.SetParameterSize(structPassFunc, (uint)offset);

            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);
            cuda.Launch(structPassFunc, blocksPerGrid, 1);

            cuda.RecordEvent(end);

            cuda.SynchronizeContext();
            //cuda.SynchronizeEvent(end);
            timer.Stop();
            float naiveTime = cuda.ElapsedTime(start, end);

            Console.Write("EllPack Cached Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed);

            cuda.CopyDeviceToHost(dOutput, output);

            int lenght = Math.Min(displayCount, N);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(valsPtr);
            cuda.Free(idxPtr);
            cuda.Free(dOutput);
            cuda.DestroyArray(cuArr);
            cuda.DestroyTexture(cuTexRef);
            return output;
        }
示例#12
0
        private static void CuAddVec()
        {
            int N = 50000;
            uint size = (uint)N * sizeof(float);

            CUDA cuda = new CUDA(0, true);

            // load module
            cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));
            CUfunction vecAddFunc = cuda.GetModuleFunction("VecAdd");

            float[] A = new float[N];
            float[] B = new float[N];
            float[] C = new float[N];
            for (int i = 0; i < A.Length; i++)
            {
                A[i] = (float)i;
                B[i] = (float)i + 0.1f;
            }

            CUdeviceptr dA = cuda.CopyHostToDevice(A);
            CUdeviceptr dB = cuda.CopyHostToDevice(B);

            CUdeviceptr dC = cuda.Allocate(A);

            int threadsPerBlock = 256;
            int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

            //error = cuFuncSetBlockShape(vecAdd, threadsPerBlock, 1, 1);

            cuda.SetFunctionBlockShape(vecAddFunc, threadsPerBlock, 1, 1);

            int offset = 0;
            cuda.SetParameter(vecAddFunc, offset, (uint)dA.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(vecAddFunc, offset, (uint)dB.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(vecAddFunc, offset, (uint)dC.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(vecAddFunc, offset, (uint)N);
            offset += sizeof(int);
            cuda.SetParameterSize(vecAddFunc, (uint)offset);

            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            cuda.RecordEvent(start);
            cuda.Launch(vecAddFunc, blocksPerGrid, 1);
            cuda.RecordEvent(end);

            cuda.SynchronizeContext();
            //cuda.SynchronizeEvent(end);

            float naiveTime = cuda.ElapsedTime(start, end);
            Console.Write("adding takes {0}ms", naiveTime);

            cuda.CopyDeviceToHost(dC, C);

            for (int i = 0; i < 10; i++)
            {
                Console.WriteLine("{0}-{1}", i, C[i]);
            }
        }
示例#13
0
        private static unsafe void CuStructPass()
        {
            int N = 4;

            int sparseVecSize = sizeof(SparseVecPtr);

            uint size = (uint)(N * sizeof(SparseVecPtr));

            CUDA cuda = new CUDA(0, true);

            // load module
            cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));
            CUfunction structPassFunc = cuda.GetModuleFunction("StructPass");

            SparseVecPtr[] vectors = new SparseVecPtr[N];

            for (int i = 0; i < N; i++)
            {
                vectors[i] = new SparseVecPtr();
                vectors[i].size = 2;
                float[] vals = new float[2] { (float)i + 1 % 5, (float)i + 2 % 7 };

                //GCHandle valHandle = GCHandle.Alloc(vals, GCHandleType.Pinned);
                //vectors[i].values = valHandle.AddrOfPinnedObject();

                int[] index = new int[2] { i % 5, i % 7 };
                //GCHandle idxHandle = GCHandle.Alloc(index, GCHandleType.Pinned);
                //vectors[i].indices = idxHandle.AddrOfPinnedObject();

                //valHandle.Free();
                //idxHandle.Free();

                CUdeviceptr valsPtr = cuda.CopyHostToDevice(vals);
                CUdeviceptr idxPtr = cuda.CopyHostToDevice(index);

                vectors[i].indices = new IntPtr(idxPtr.Pointer);
                vectors[i].values = (IntPtr)valsPtr.Pointer;

            }

            GCHandle handle = GCHandle.Alloc(vectors, GCHandleType.Pinned);
            IntPtr ptr = handle.AddrOfPinnedObject();

            float[] output = new float[N];

            //CUdeviceptr dVectors = cuda.CopyHostToDevice(vectors);

            CUdeviceptr dVectors = cuda.CopyHostToDevice(ptr, size);
            CUdeviceptr dOutput = cuda.Allocate(output);

            int threadsPerBlock = 256;
            int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

            //error = cuFuncSetBlockShape(vecAdd, threadsPerBlock, 1, 1);

            cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1);

            int offset = 0;
            cuda.SetParameter(structPassFunc, offset, (uint)dVectors.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(structPassFunc, offset, (uint)dOutput.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(structPassFunc, offset, (uint)N);
            offset += sizeof(int);
            cuda.SetParameterSize(structPassFunc, (uint)offset);

            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            cuda.RecordEvent(start);
            cuda.Launch(structPassFunc, blocksPerGrid, 1);
            cuda.RecordEvent(end);

            cuda.SynchronizeContext();
            //cuda.SynchronizeEvent(end);

            float naiveTime = cuda.ElapsedTime(start, end);
            Console.Write("passing struct takes {0}ms", naiveTime);

            cuda.CopyDeviceToHost(dOutput, output);

            int lenght = Math.Min(10, N);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }
        }
示例#14
0
        static void Main(string[] args)
        {
            // Create a new instance of CUDA class, select 1st device.
            CUDA cuda = new CUDA(0, true);

            // Prepare parameters.
            int  n      = 16 * 1024 * 1024;
            uint nbytes = (uint)(n * sizeof(int));
            int  value  = 26;

            // allocate host memory
            int[] a = new int[n];

            // allocate device memory
            CUdeviceptr d_a = cuda.Allocate <int>(a);

            CUDADriver.cuMemsetD8(d_a, 0xff, nbytes);

            // load module
            cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "asyncAPI.ptx"));
            CUfunction func = cuda.GetModuleFunction("increment_kernel");

            // set kernel launch configuration
            cuda.SetFunctionBlockShape(func, 512, 1, 1);

            // create cuda event handles
            CUevent start = cuda.CreateEvent();
            CUevent stop  = cuda.CreateEvent();

            // asynchronously issue work to the GPU (all to stream 0)
            CUstream stream = new CUstream();

            cuda.RecordEvent(start);
            cuda.CopyHostToDeviceAsync <int>(d_a, a, stream);

            // set parameters for kernel function
            cuda.SetParameter(func, 0, (uint)d_a.Pointer);
            cuda.SetParameter(func, IntPtr.Size, (uint)value);

            cuda.SetParameterSize(func, (uint)(IntPtr.Size + 4));

            // actually launch kernel
            cuda.LaunchAsync(func, n / 512, 1, stream);

            // wait for every thing to finish, then start copy back data
            cuda.CopyDeviceToHostAsync <int>(d_a, a, stream);

            cuda.RecordEvent(stop);

            // print the cpu and gpu times
            Console.WriteLine("time spent executing by the GPU: {0} ms", cuda.ElapsedTime(start, stop));

            // check the output for correctness
            if (CorrectOutput(a, value))
            {
                Console.WriteLine("Test PASSED");
            }
            else
            {
                Console.WriteLine("Test FAILED");
            }

            // release resources
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(stop);
            cuda.Free(d_a);
        }
示例#15
0
        private void SetCudaData(Problem <SparseVec> sub_prob)
        {
            int vecDim = sub_prob.FeaturesCount;//.Elements[0].Dim;

            /*
             * copy vectors to CUDA device
             */

            #region copy trainning examples to GPU

            float[] vecVals;
            int[]   vecIdx;
            int[]   vecLenght;
            CudaHelpers.TransformToCSRFormat(out vecVals, out vecIdx, out vecLenght, sub_prob.Elements);
            valsCSRPtr      = cuda.CopyHostToDevice(vecVals);
            idxCSRPtr       = cuda.CopyHostToDevice(vecIdx);
            vecLenghtCSRPtr = cuda.CopyHostToDevice(vecLenght);


            CudaHelpers.TransformToCSCFormat(out vecVals, out vecIdx, out vecLenght, sub_prob.Elements);
            valsCSCPtr      = cuda.CopyHostToDevice(vecVals);
            idxCSCPtr       = cuda.CopyHostToDevice(vecIdx);
            vecLenghtCSCPtr = cuda.CopyHostToDevice(vecLenght);

            #endregion

            /*
             * allocate memory for gradient
             */
            alphaMemSize = (uint)(sub_prob.ElementsCount * sizeof(float));

            gradPtr    = cuda.Allocate(alphaMemSize);
            gradOldPtr = cuda.Allocate(alphaMemSize);

            alphaPtr    = cuda.Allocate(alphaMemSize);
            alphaOldPtr = cuda.Allocate(alphaMemSize);
            alphaTmpPtr = cuda.Allocate(alphaMemSize);


            /*
             * reduction blocks for computing Obj
             */



            GetNumThreadsAndBlocks(vecDim, 64, threadsPerBlock, ref threadsForReduceObjW, ref bpgReduceW);

            reduceObjW = new float[bpgReduceW];
            uint reduceWBytes = (uint)bpgReduceW * sizeof(float);
            reduceObjWPtr = cuda.Allocate(reduceWBytes);

            /*
             * reduction size for kernels which operate on alpha
             */
            int reductionSize = problem.ElementsCount;
            threadsForReduceObjAlpha = 0;

            GetNumThreadsAndBlocks(problem.ElementsCount, 64, threadsPerBlock, ref threadsForReduceObjAlpha, ref bpgReduceAlpha);

            uint alphaReductionBytes = (uint)bpgReduceAlpha * sizeof(float);

            /*
             * reduction array for computing objective function value
             */

            reduceObjAlpha    = new float[bpgReduceAlpha];
            reduceObjAlphaPtr = cuda.Allocate(alphaReductionBytes);


            /*
             * reduction arrays for computing BB step
             */
            alphaPartReduce     = new float[bpgReduceAlpha];
            gradPartReduce      = new float[bpgReduceAlpha];
            alphaGradPartReduce = new float[bpgReduceAlpha];

            reduceBBAlphaGradPtr = cuda.Allocate(alphaReductionBytes);
            reduceBBAlphaPtr     = cuda.Allocate(alphaReductionBytes);
            reduceBBGradPtr      = cuda.Allocate(alphaReductionBytes);

            /*
             * reduction arrays for comuting lin part
             */
            reduceLinPart    = new float[bpgReduceAlpha];
            reduceLinPartPtr = cuda.Allocate(alphaReductionBytes);



            //float[] wVec = new float[vecDim];
            wVecMemSize = (uint)vecDim * sizeof(float);
            wTempVecPtr = cuda.Allocate(wVecMemSize);
            //move W wector
            SetTextureMemory(ref cuWVecTexRef, cudaWVecTexRefName, ref wVecPtr, wVecMemSize);

            //set texture memory for labels
            SetTextureMemory(ref cuLabelsTexRef, cudaLabelsTexRefName, sub_prob.Y, ref labelsPtr);


            SetTextureMemory(ref cuDeltasTexRef, "deltasTexRef", ref deltasPtr, alphaMemSize);

            diagPtr = cuda.GetModuleGlobal(cuModule, "diag_shift");


            stepBBPtr = cuda.GetModuleGlobal(cuModule, "stepBB");
            float[] stepData = new float[] { 0.1f };
            cuda.CopyHostToDevice(stepBBPtr, stepData);

            SetCudaParameters(sub_prob);
        }
示例#16
0
        private static unsafe float[] CuDotProdSparseVecStruct()
        {
            int sparseVecSize = sizeof(SparseVecPtr);

            uint size = (uint)(N * sizeof(SparseVecPtr));

            //always the same values
            Random rnd = new Random(1);

            CUDA cuda = new CUDA(0, true);

            // load module
            cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));
            //CUfunction structPassFunc = cuda.GetModuleFunction("DotProd");
            CUfunction structPassFunc = cuda.GetModuleFunction("DotProd2");

            SparseVecPtr[] vectors = new SparseVecPtr[N];
            Console.WriteLine("init and copy data");
            Stopwatch t = Stopwatch.StartNew();
            mainIndex = StartingIndex;
            for (int i = 0; i < N; i++)
            {
                vectors[i] = new SparseVecPtr();

                int vecSize = avgElements + i % stdElements;
                vectors[i].size = vecSize;
                float[] vals = Helpers.InitValues(i, vecSize, maxVal);

                int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);

                CUdeviceptr valsPtr = cuda.CopyHostToDevice(vals);
                CUdeviceptr idxPtr = cuda.CopyHostToDevice(index);

                vectors[i].indices = new IntPtr(idxPtr.Pointer);
                vectors[i].values = (IntPtr)valsPtr.Pointer;
            }

            GCHandle handle = GCHandle.Alloc(vectors, GCHandleType.Pinned);
            IntPtr ptr = handle.AddrOfPinnedObject();

            float[] output = new float[N];

            //CUdeviceptr dVectors = cuda.CopyHostToDevice(vectors);

            CUdeviceptr dVectors = cuda.CopyHostToDevice(ptr, size);
            CUdeviceptr dOutput = cuda.Allocate(output);

            Console.WriteLine("copy and init takes {0}", t.Elapsed);
            #region set cuda parameters
            cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1);

            int offset = 0;
            cuda.SetParameter(structPassFunc, offset, dVectors.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(structPassFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(structPassFunc, offset, (uint)mainIndex);
            offset += sizeof(int);
            cuda.SetParameter(structPassFunc, offset, (uint)N);
            offset += sizeof(int);
            cuda.SetParameterSize(structPassFunc, (uint)offset);
            #endregion
            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);

            cuda.Launch(structPassFunc, blocksPerGrid, 1);

            cuda.RecordEvent(end);

            cuda.SynchronizeContext();
            //cuda.SynchronizeEvent(end);
            timer.Stop();
            float naiveTime = cuda.ElapsedTime(start, end);

            Console.Write("Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed);

            cuda.CopyDeviceToHost(dOutput, output);

            int lenght = Math.Min(displayCount, N);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(dVectors);
            cuda.Free(dOutput);

            return output;
        }
示例#17
0
                private void Worker()
                {
#if CUDA
                    int    nN = _nMergingDeviceNumber;
                    string sS = "CUDA-" + nN + "-" + _nIndex;
                    try
                    {
                        Command cCmd;
                        CUDA    cCUDA;
                        #region CUDA Init
                        try
                        {
                            cCUDA = new CUDA(true);
                            cCUDA.CreateContext(nN % 1000); // number of cuda in prefs (still alwais 0)
                        }
                        catch (Exception ex)
                        {
                            if (ex is CUDAException)
                            {
                                ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
                            }

                            throw new Exception("CreateContext(" + nN % 1000 + ") error. Try to change CUDA's card number in prefs", ex);
                        }
                        (new Logger(sS)).WriteDebug("CreateContext(" + nN % 1000 + ") is ok!");

                        uint   nMemoryReservedForMerge    = 2 * 1024 * 1024;       //PREFERENCES типа <memory reserved="2097152" />
                        uint   nMemoryStarvationThreshold = cCUDA.TotalMemory / 2; //PREFERENCES через проценты... типа <memory starvation="50%" />
                        uint   nMemoryFree;
                        string sModule = "CUDAFunctions_" + Preferences.nCUDAVersion + "_x" + (IntPtr.Size * 8);
                        if (Logger.bDebug)
                        {
                            (new Logger(sS)).WriteDebug(sModule + "   Current CUDA = [name=" + cCUDA.CurrentDevice.Name + "][compute_capability=" + cCUDA.CurrentDevice.ComputeCapability + "]");
                        }
                        cCUDA.LoadModule((byte[])Properties.Resource.ResourceManager.GetObject(sModule)); //   $(ProjectDir)Resources\CUDAFunctions.cubin
                        CUfunction cCUDAFuncMerge   = cCUDA.GetModuleFunction("CUDAFrameMerge");
                        int        nThreadsPerBlock = 16;                                                 //32 //256 //пришлось уменьшить с 512 до 256 сридов на блок, потому что при добавлении "движения" и операций с float, ловил ошибку: Too Many Resources Requested for Launch (This error means that the number of registers available on the multiprocessor is being exceeded. Reduce the number of threads per block to solve the problem)
                        cCUDA.SetFunctionBlockShape(cCUDAFuncMerge, nThreadsPerBlock, nThreadsPerBlock, 1);
                        CUDADriver.cuParamSetSize(cCUDAFuncMerge, 8);

                        Dictionary <long, CUdeviceptr> ahPMIDs_DevicePointers = new Dictionary <long, CUdeviceptr>();
                        CUdeviceptr cPMs;
                        CUdeviceptr cInfos;
                        CUdeviceptr cAlphaMap;
                        CUdeviceptr cAlphaMap_info3d;
                        CUdeviceptr cAlphaMap_info2d;
                        if (true)
                        {
                            //IntPtr[] aPointersByAlpha = new IntPtr[254];  //те самые поинтеры-альфы. Ссылаются на массивы поинтеров B, т.е. BackGrounds
                            //IntPtr[] aPointersByBackground = new IntPtr[256];   //  те самые массивы поинтеров B, т.е. BackGrounds
                            byte[]   aAlphaMap = new byte[(byte.MaxValue - 1) * (byte.MaxValue + 1) * (byte.MaxValue + 1)];
                            int[]    aAlphaMap_info3d = new int[254];    // начала 2d слоёв
                            ushort[] aAlphaMap_info2d = new ushort[256]; // начала строк в одном 2d
                            int      nResult, nIndx = 0, nIndxInfo = 0, nIndx2d = 0;
                            for (byte nAlpha = 1; 255 > nAlpha; nAlpha++)
                            {
                                aAlphaMap_info3d[nIndxInfo++] = nIndx;
                                for (ushort nBackground = 0; 256 > nBackground; nBackground++)
                                {
                                    if (nAlpha == 1)
                                    {
                                        aAlphaMap_info2d[nIndx2d++] = (ushort)nIndx;
                                    }
                                    for (ushort nForeground = 0; 256 > nForeground; nForeground++)
                                    {
                                        if (255 < (nResult = (int)((float)(nAlpha * (nForeground - nBackground)) / 255 + nBackground + 0.5)))
                                        {
                                            nResult = 255;
                                        }
                                        aAlphaMap[nIndx++] = (byte)nResult;
                                    }
                                    //aPointersByBackground[nBackground] = (IntPtr)cCUDA.CopyHostToDevice<byte>(aResults).Pointer;
                                }
                                //aPointersByAlpha[nAlpha - 1] = (IntPtr)cCUDA.CopyHostToDevice<IntPtr>(aPointersByBackground).Pointer;
                            }
                            cAlphaMap_info3d = cCUDA.CopyHostToDevice <int>(aAlphaMap_info3d);
                            cAlphaMap        = cCUDA.CopyHostToDevice <byte>(aAlphaMap);
                            cAlphaMap_info2d = cCUDA.CopyHostToDevice <ushort>(aAlphaMap_info2d);
                        }
                        CUdeviceptr cAlphaMap2;
                        CUdeviceptr cAlphaMap2_info2d;
                        {
                            byte[]   aAlphaMap2 = new byte[(byte.MaxValue - 1) * (byte.MaxValue - 1)];
                            ushort[] aAlphaMap2_info2d = new ushort[254];
                            int      nIndx = 0, nIndx2d = 0;
                            for (byte nFGColorAlpha = 1; 255 > nFGColorAlpha; nFGColorAlpha++)  // можно использовать симметрию умножения, но х с ней пока
                            {
                                aAlphaMap2_info2d[nIndx2d++] = (ushort)nIndx;
                                for (byte nPixelAlpha = 1; 255 > nPixelAlpha; nPixelAlpha++)
                                {
                                    aAlphaMap2[nIndx++] = (byte)((float)nFGColorAlpha * nPixelAlpha / 255 + 0.5);
                                }
                            }
                            cAlphaMap2        = cCUDA.CopyHostToDevice <byte>(aAlphaMap2);
                            cAlphaMap2_info2d = cCUDA.CopyHostToDevice <ushort>(aAlphaMap2_info2d);
                        }
                        CUdeviceptr cAlphaMap3;
                        CUdeviceptr cAlphaMap3_info2d;
                        {
                            byte[]   aAlphaMap3 = new byte[byte.MaxValue * (byte.MaxValue - 1)];
                            ushort[] aAlphaMap3_info2d = new ushort[255];
                            int      nIndx = 0, nIndx2d = 0;
                            for (ushort nFGColorAlpha = 1; 256 > nFGColorAlpha; nFGColorAlpha++)
                            {
                                aAlphaMap3_info2d[nIndx2d++] = (ushort)nIndx;
                                for (byte nMask = 1; 255 > nMask; nMask++)
                                {
                                    aAlphaMap3[nIndx++] = (byte)(nFGColorAlpha * ((255 - nMask) / 255f) + 0.5);
                                }
                            }
                            cAlphaMap3        = cCUDA.CopyHostToDevice <byte>(aAlphaMap3);
                            cAlphaMap3_info2d = cCUDA.CopyHostToDevice <ushort>(aAlphaMap3_info2d);
                        }
                        #endregion CUDA Init
#if DEBUG
                        Dictionary <long, DateTime> ahDebug   = new Dictionary <long, DateTime>();
                        Dictionary <long, Area>     ahDebugAr = new Dictionary <long, Area>();
#endif
                        DateTime         dtNextTime = DateTime.MinValue, dtNow;
                        bool             bSet;
                        List <IntPtr>    aDPs;
                        List <PixelsMap> aPMs;

                        while (true)
                        {
                            if (1 > aqQueue.CountGet() && (dtNow = DateTime.Now) > dtNextTime)
                            {
                                dtNextTime = dtNow.AddMinutes(20);
#if DEBUG
                                dtNow = dtNow.Subtract(TimeSpan.FromHours(2));
                                string sMessage = "";
                                foreach (long nID in ahDebug.OrderBy(o => o.Value).Select(o => o.Key))
                                {
                                    if (dtNow > ahDebug[nID])
                                    {
                                        sMessage += "<br>[" + nID + " - " + ahDebug[nID].ToString("HH:mm:ss") + "]" + ahDebugAr[nID].ToString();
                                    }
                                }
#endif
                                (new Logger(sS)).WriteDebug("CUDA free memory:" + cCUDA.FreeMemory
#if DEBUG
                                                            + "; possibly timeworn allocations:" + (1 > sMessage.Length ? "no" : sMessage)
#endif
                                                            );
                            }
                            cCmd = aqQueue.Dequeue();  //если нечего отдать - заснёт
                            switch (cCmd.eID)
                            {
                            case Command.ID.Allocate:
                                #region
                                try
                                {
                                    cCmd.cPM._cException = null;
                                    if (1 > cCmd.cPM._nID)
                                    {
                                        if (0 < cCmd.cPM._nBytesQty)
                                        {
                                            nMemoryFree = cCUDA.FreeMemory;
                                            if (nMemoryReservedForMerge < nMemoryFree - cCmd.cPM._nBytesQty)
                                            {
                                                bMemoryStarvation = (nMemoryFree < nMemoryStarvationThreshold);
                                                (new Logger(sS)).WriteDebug3("pixelmap allocateCUDA [current_id=" + _nCurrentID + "]");
                                                cCmd.cPM._nID = System.Threading.Interlocked.Increment(ref _nCurrentID);
                                                ahPMIDs_DevicePointers.Add(cCmd.cPM._nID, cCUDA.Allocate(cCmd.cPM._nBytesQty));
#if DEBUG
                                                ahDebug.Add(cCmd.cPM._nID, DateTime.Now);
                                                ahDebugAr.Add(cCmd.cPM._nID, cCmd.cPM.stArea);
#endif
                                            }
                                            else
                                            {
                                                bMemoryStarvation = true;
                                                throw new Exception("out of memory in CUDA device during Allocate. Only 2 MBytes reserved for the Merge");
                                            }
                                        }
                                        else
                                        {
                                            throw new Exception("bytes quantity in PixelsMap have to be greater than zero for Allocate [_bDisposed = " + cCmd.cPM._bDisposed + "][_bProcessing = " + cCmd.cPM._bProcessing + "][_stPosition.X = " + cCmd.cPM._stPosition.X + "][_stPosition.Y = " + cCmd.cPM._stPosition.Y + "][_bTemp = " + cCmd.cPM._bTemp + "][_dt = " + cCmd.cPM._dtCreate + "][_nBytesQty = " + cCmd.cPM._nBytesQty + "][_nID = " + cCmd.cPM._nID + "][_nShiftTotalX = " + cCmd.cPM._nShiftTotalX + "][_stArea.nHeight = " + cCmd.cPM._stArea.nHeight + "][_stArea.nWidth = " + cCmd.cPM._stArea.nWidth + "][bKeepAlive = " + cCmd.cPM.bKeepAlive + "][eAlpha = " + cCmd.cPM.eAlpha + "][bCUDA = " + cCmd.cPM.stMergingMethod + "][nAlphaConstant = " + cCmd.cPM.nAlphaConstant + "][nID = " + cCmd.cPM.nID + "][nLength = " + cCmd.cPM.nLength + "][stArea.nHeight = " + cCmd.cPM.stArea.nHeight + "][stArea.nWidth = " + cCmd.cPM.stArea.nWidth + "]");
                                        }
                                    }
                                    else
                                    {
                                        throw new Exception("PixelsMap ID have to be zero for Allocate");
                                    }
                                }
                                catch (Exception ex)
                                {
                                    if (ex is CUDAException)
                                    {
                                        ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
                                    }
                                    (new Logger(sS)).WriteError(ex);
                                    (new Logger(sS)).WriteDebug("bytes qty:" + cCmd.cPM._nBytesQty);
                                    cCmd.cPM._cException = ex;
                                }
                                cCmd.cMRE.Set();
                                break;

                                #endregion
                            case Command.ID.CopyIn:
                                #region
                                try
                                {
                                    cCmd.cPM._cException = null;
                                    if (1 > cCmd.cPM._nID)
                                    {
                                        if (cCUDA.FreeMemory - cCmd.cPM._nBytesQty > nMemoryReservedForMerge)
                                        {
                                            (new Logger(sS)).WriteDebug3("pixelmap copyinCUDA not allocated [pm_id=" + _nCurrentID + "]");
                                            cCmd.cPM._nID = System.Threading.Interlocked.Increment(ref _nCurrentID);
                                            if (cCmd.ahParameters.ContainsKey(typeof(IntPtr)))
                                            {
                                                ahPMIDs_DevicePointers.Add(cCmd.cPM._nID, cCUDA.CopyHostToDevice((IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty));
                                            }
                                            else if (cCmd.ahParameters.ContainsKey(typeof(byte[])))
                                            {
                                                ahPMIDs_DevicePointers.Add(cCmd.cPM._nID, cCUDA.CopyHostToDevice((byte[])cCmd.ahParameters[typeof(byte[])]));
                                            }
                                            else
                                            {
                                                throw new Exception("unknown parameter type");
                                            }
#if DEBUG
                                            ahDebug.Add(cCmd.cPM._nID, DateTime.Now);
                                            ahDebugAr.Add(cCmd.cPM._nID, cCmd.cPM.stArea);
#endif
                                        }
                                        else
                                        {
                                            throw new Exception("out of memory in CUDA device during CopyIn. Only 2 MBytes reserved for the Merge.");
                                        }
                                    }
                                    else
                                    {
                                        (new Logger(sS)).WriteDebug4("pixelmap copyinCUDA allocated [pm_id=" + _nCurrentID + "]");
                                        if (cCmd.ahParameters.ContainsKey(typeof(IntPtr)))
                                        {
                                            cCUDA.CopyHostToDevice(ahPMIDs_DevicePointers[cCmd.cPM._nID], (IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty);
                                        }
                                        else if (cCmd.ahParameters.ContainsKey(typeof(byte[])))
                                        {
                                            cCUDA.CopyHostToDevice(ahPMIDs_DevicePointers[cCmd.cPM._nID], (byte[])cCmd.ahParameters[typeof(byte[])]);
                                        }
                                        else
                                        {
                                            throw new Exception("unknown parameter type");
                                        }
                                    }
                                }
                                catch (Exception ex)
                                {
                                    if (ex is CUDAException)
                                    {
                                        ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
                                    }
                                    (new Logger(sS)).WriteError(ex);
                                    cCmd.cPM._cException = ex;
                                }
                                cCmd.cMRE.Set();
                                #endregion
                                break;

                            case Command.ID.CopyOut:
                                #region
                                try
                                {
                                    if (0 < cCmd.cPM._nID)
                                    {
                                        if (!cCmd.ahParameters.ContainsKey(typeof(IntPtr)))
                                        {
                                            if (cCmd.ahParameters.ContainsKey(typeof(byte[])))
                                            {
                                                byte[] aB = (byte[])cCmd.ahParameters[typeof(byte[])];
                                                cCmd.cPM._aBytes = null;
                                                if (cCmd.cPM._nBytesQty != aB.Length)
                                                {
                                                    (new Logger(sS)).WriteWarning("wrong array size for copyout [got:" + aB.Length + "][expected:" + cCmd.cPM._nBytesQty + "]");
                                                }
                                                cCUDA.CopyDeviceToHost <byte>(ahPMIDs_DevicePointers[cCmd.cPM._nID], aB);
                                            }
                                            else      // не юзается (см. copyout())
                                            {
                                                cCmd.cPM._aBytes = _cBinM.BytesGet((int)cCmd.cPM._nBytesQty, 3);
                                                cCUDA.CopyDeviceToHost <byte>(ahPMIDs_DevicePointers[cCmd.cPM._nID], cCmd.cPM._aBytes.aBytes);
                                            }
                                        }
                                        else
                                        {
                                            cCUDA.CopyDeviceToHost(ahPMIDs_DevicePointers[cCmd.cPM._nID], (IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty);
                                        }
                                        (new Logger(sS)).WriteDebug5("copy out [id:" + cCmd.cPM._nID + "][ptr:" + ahPMIDs_DevicePointers[cCmd.cPM._nID].Pointer + "]");
                                    }
                                    else
                                    {
                                        throw new Exception("PixelsMap have to be allocated for CopyOut");
                                    }
                                }
                                catch (Exception ex)
                                {
                                    if (ex is CUDAException)
                                    {
                                        ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
                                    }
                                    (new Logger(sS)).WriteError(ex);
                                    cCmd.cPM._cException = ex;
                                }
                                cCmd.cMRE.Set();
                                #endregion
                                break;

                            case Command.ID.Merge:
                                #region
                                bSet = false;
                                try
                                {
                                    aPMs = (List <PixelsMap>)cCmd.ahParameters[typeof(List <PixelsMap>)];
                                    DisCom.MergeInfo cMergeInfo = (DisCom.MergeInfo)cCmd.ahParameters[typeof(DisCom.MergeInfo)];
                                    aDPs = new List <IntPtr>();

                                    if (1 > cCmd.cPM._nID)
                                    {
                                        throw new Exception("background PixelsMap have to be allocated for Merge");
                                    }

                                    aDPs.Add((IntPtr)ahPMIDs_DevicePointers[cCmd.cPM._nID].Pointer);
                                    for (int nIndx = 0; nIndx < aPMs.Count; nIndx++)
                                    {
                                        if (!ahPMIDs_DevicePointers.ContainsKey(aPMs[nIndx]._nID))
                                        {
                                            throw new Exception("there is a corrupted ID in layers for merge [id:" + aPMs[nIndx]._nID + "]");
                                        }
                                        if (1 > ahPMIDs_DevicePointers[aPMs[nIndx]._nID].Pointer)
                                        {
                                            throw new Exception("there is an empty pointer in layers for merge [id:" + aPMs[nIndx]._nID + "]");
                                        }
                                        aDPs.Add((IntPtr)ahPMIDs_DevicePointers[aPMs[nIndx]._nID].Pointer);
                                    }

                                    cPMs   = cCUDA.CopyHostToDevice <IntPtr>(aDPs.ToArray());
                                    cInfos = cCUDA.CopyHostToDevice(cMergeInfo, cMergeInfo.SizeGet());      // operator intptr in DisCom.MergeInfo

                                    cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, 0, (IntPtr)cPMs.Pointer);
                                    cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size, (IntPtr)cInfos.Pointer);
                                    cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 2, (IntPtr)cAlphaMap.Pointer);         //
                                    cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 3, (IntPtr)cAlphaMap_info3d.Pointer);  //
                                    cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 4, (IntPtr)cAlphaMap_info2d.Pointer);  //
                                    cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 5, (IntPtr)cAlphaMap2.Pointer);
                                    cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 6, (IntPtr)cAlphaMap2_info2d.Pointer); //
                                    cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 7, (IntPtr)cAlphaMap3.Pointer);
                                    cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 8, (IntPtr)cAlphaMap3_info2d.Pointer); //
                                    cCUDA.SetParameterSize(cCUDAFuncMerge, (uint)(IntPtr.Size * 9));
                                    int nIterationsX = (0 == cMergeInfo.nBackgroundWidth % nThreadsPerBlock ? cMergeInfo.nBackgroundWidth / nThreadsPerBlock : cMergeInfo.nBackgroundWidth / nThreadsPerBlock + 1);
                                    int nIterationsY = (0 == cMergeInfo.nBackgroundHight % nThreadsPerBlock ? cMergeInfo.nBackgroundHight / nThreadsPerBlock : cMergeInfo.nBackgroundHight / nThreadsPerBlock + 1);
                                    //int nIterationsX = (0 == cMergeInfo.nBackgroundHight % nThreadsPerBlock ? cMergeInfo.nBackgroundHight / nThreadsPerBlock : cMergeInfo.nBackgroundHight / nThreadsPerBlock + 1);

                                    cCUDA.Launch(cCUDAFuncMerge, nIterationsX, nIterationsY);



                                    cCUDA.Free(cPMs);
                                    cCUDA.Free(cInfos);

                                    cCmd.cMRE.Set();
                                    bSet = true;

                                    cMergeInfo.Dispose();
                                    for (int nIndx = 0; nIndx < aPMs.Count; nIndx++)
                                    {
                                        lock (aPMs[nIndx]._cSyncRoot)
                                            aPMs[nIndx]._bProcessing = false;
                                        aPMs[nIndx].Dispose();
                                    }
                                }
                                catch (Exception ex)
                                {
                                    cCmd.cPM._cException = ex;
                                    if (!bSet)
                                    {
                                        cCmd.cMRE.Set();
                                    }
                                    if (ex is CUDAException)
                                    {
                                        ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
                                    }
                                    (new Logger(sS)).WriteError(ex);
                                }
                                #endregion
                                break;

                            case Command.ID.Dispose:
                                #region
                                (new Logger(sS)).Write(Logger.Level.debug4, "dispose: in");
                                try
                                {
                                    if (ahPMIDs_DevicePointers.ContainsKey(cCmd.cPM._nID))
                                    {
                                        if (0 < cCmd.cPM._nID && 0 < ahPMIDs_DevicePointers[cCmd.cPM._nID].Pointer)
                                        {
                                            cCUDA.Free(ahPMIDs_DevicePointers[cCmd.cPM._nID]);
                                            //cCUDA.SynchronizeContext();
                                            bMemoryStarvation = (cCUDA.FreeMemory < nMemoryStarvationThreshold);
                                            (new Logger(sS)).WriteDebug3("dispose [id:" + cCmd.cPM._nID + "][ptr:" + ahPMIDs_DevicePointers[cCmd.cPM._nID].Pointer + "]");
                                        }
                                        ahPMIDs_DevicePointers.Remove(cCmd.cPM._nID);
#if DEBUG
                                        ahDebug.Remove(cCmd.cPM._nID);
                                        ahDebugAr.Remove(cCmd.cPM._nID);
#endif
                                        cCmd.cPM._nID = 0;
                                    }
                                }
                                catch (Exception ex)
                                {
                                    if (ex is CUDAException)
                                    {
                                        ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
                                    }
                                    (new Logger(sS)).WriteError(ex);
                                    cCmd.cPM._cException = ex;
                                }
                                (new Logger(sS)).Write(Logger.Level.debug4, "dispose: out");
                                #endregion
                                break;
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        if (ex is CUDAException)
                        {
                            ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
                        }
                        (new Logger(sS)).WriteError("CUDA STOPPED!!!! [id = " + _nIndex + "]", ex);
                    }
#endif
                }
示例#18
0
        private static float[] CuRBFCSRCached()
        {
            //always the same values
            Random rnd = new Random(1);

            CUDA cuda = new CUDA(0, true);

            // load module
            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));

            CUfunction structPassFunc = cuda.GetModuleFunction("RBFspmv_csr_vector");

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("init arrays");
            Stopwatch t = Stopwatch.StartNew();
            List<float> vecValsL = new List<float>(N * maxRowSize / 2);
            List<int> vecIdxL = new List<int>(N * maxRowSize / 2);
            List<int> vecLenghtL = new List<int>(N);

            float[] vecVals;
            int[] vecIdx;
            int[] vecLenght;
            float[] selfDot = new float[N];

            maxIndex = 0;
            int vecStartIdx = 0;
            for (int i = 0; i < N; i++)
            {
                int vecSize = avgElements + i % stdElements;

                float[] vals = Helpers.InitValues(i, vecSize, maxVal);
                vecValsL.AddRange(vals);

                for (int z = 0; z < vals.Length; z++)
                {
                    selfDot[i] += vals[z] * vals[z];
                }
                int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);
                vecIdxL.AddRange(index);

                vecLenghtL.Add(vecStartIdx);
                vecStartIdx += vecSize;

            }
            //for last index
            vecLenghtL.Add(vecStartIdx);

            vecVals = vecValsL.ToArray();
            vecIdx = vecIdxL.ToArray();
            vecLenght = vecLenghtL.ToArray();

            float[] mainVec = new float[maxIndex + 1];

            for (int j = vecLenght[mainIndex]; j < vecLenght[mainIndex + 1]; j++)
            {
                int idx = vecIdx[j];
                float val = vecVals[j];
                mainVec[idx] = val;
            }
            Console.WriteLine("Init takes {0}", t.Elapsed);
            t.Start();

            CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals);
            CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx);
            CUdeviceptr vecLenghtPtr = cuda.CopyHostToDevice(vecLenght);
            CUdeviceptr selfDotPtr = cuda.CopyHostToDevice(selfDot);

            //copy to texture
            CUarray cuArr = cuda.CreateArray(mainVec);
            cuda.CopyHostToArray(cuArr, mainVec, 0);
            CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef");
            cuda.SetTextureFlags(cuTexRef, 0);
            cuda.SetTextureArray(cuTexRef, cuArr);

            float[] output = new float[N];
            CUdeviceptr dOutput = cuda.Allocate(output);

            Console.WriteLine("copy to device takes {0}", t.Elapsed);

            cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1);

            int offset = 0;
            cuda.SetParameter(structPassFunc, offset, valsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(structPassFunc, offset, idxPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(structPassFunc, offset, vecLenghtPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(structPassFunc, offset, selfDotPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(structPassFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(structPassFunc, offset, (uint)N);
            offset += sizeof(int);
            cuda.SetParameter(structPassFunc, offset, (uint)mainIndex);
            offset += sizeof(int);
            cuda.SetParameter(structPassFunc, offset, Gamma);
            offset += sizeof(float);

            cuda.SetParameter(structPassFunc, offset, (uint)vecStartIdx);
            offset += sizeof(int);
            cuda.SetParameterSize(structPassFunc, (uint)offset);

            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);
            cuda.Launch(structPassFunc, blocksPerGrid, 1);

            cuda.RecordEvent(end);

            cuda.SynchronizeContext();
            //cuda.SynchronizeEvent(end);
            timer.Stop();
            float naiveTime = cuda.ElapsedTime(start, end);

            Console.Write("csr vector Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed);

            cuda.CopyDeviceToHost(dOutput, output);

            int lenght = Math.Min(displayCount, N);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(valsPtr);
            cuda.Free(idxPtr);
            cuda.Free(dOutput);
            cuda.Free(selfDotPtr);
            cuda.Free(vecLenghtPtr);
            cuda.DestroyArray(cuArr);
            cuda.DestroyTexture(cuTexRef);
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);

            return output;
        }
示例#19
0
        // It's interesting to change the number of blocks and the number of threads to 
        // understand how to keep the hardware busy.
        //
        // Here are some numbers I get on my G80:
        //    blocks - clocks
        //    1 - 3096
        //    8 - 3232
        //    16 - 3364
        //    32 - 4615
        //    64 - 9981
        //
        // With less than 16 blocks some of the multiprocessors of the device are idle. With
        // more than 16 you are using all the multiprocessors, but there's only one block per
        // multiprocessor and that doesn't allow you to hide the latency of the memory. With
        // more than 32 the speed scales linearly.
        static void Main(string[] args)
        {
            // Init CUDA, select 1st device.
            CUDA cuda = new CUDA(0, true);

            // load module
            cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "clock_kernel.cubin"));
            CUfunction func = cuda.GetModuleFunction("timedReduction");

            int[] timer = new int[NUM_BLOCKS * 2];
            float[] input = new float[NUM_THREADS * 2];

            for (int i = 0; i < NUM_THREADS * 2; i++)
            {
                input[i] = (float)i;
            }

            CUdeviceptr dinput = cuda.CopyHostToDevice<float>(input);
            CUdeviceptr doutput = cuda.Allocate((uint)(sizeof(float) * NUM_BLOCKS));
            CUdeviceptr dtimer = cuda.Allocate<int>(timer);

            cuda.SetParameter(func, 0, (uint)dinput.Pointer);
            cuda.SetParameter(func, IntPtr.Size, (uint)doutput.Pointer);
            cuda.SetParameter(func, IntPtr.Size*2, (uint)dtimer.Pointer);
            cuda.SetParameterSize(func, (uint)(IntPtr.Size*3));

            //timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);
            cuda.SetFunctionBlockShape(func, NUM_THREADS, 1, 1);
            cuda.SetFunctionSharedSize(func, (uint)(sizeof(float) * 2 * NUM_THREADS));
            cuda.Launch(func, NUM_BLOCKS, 1);

            cuda.CopyDeviceToHost<int>(dtimer, timer);

            cuda.Free(dinput);
            cuda.Free(doutput);
            cuda.Free(dtimer);

            foreach (int i in timer)
            {
                Console.WriteLine(i);
            }

            Console.WriteLine("Test PASSED");

            int minStart = timer[0];
            int maxEnd = timer[NUM_BLOCKS];
            for (int i = 1; i < NUM_BLOCKS; i++)
            {
                minStart = timer[i] < minStart ? timer[i] : minStart;
                maxEnd = timer[NUM_BLOCKS + i] > maxEnd ? timer[NUM_BLOCKS + i] : maxEnd;
            }

            Console.WriteLine("time = {0}", maxEnd - minStart);
        }
示例#20
0
        private void SetCudaData(Problem <SparseVec> sub_prob)
        {
            int vecDim = sub_prob.Elements[0].Dim;

            /*
             * copy vectors to CUDA device
             */
            float[] vecVals;
            int[]   vecIdx;
            int[]   vecLenght;
            CudaHelpers.TransformToCSRFormat(out vecVals, out vecIdx, out vecLenght, sub_prob.Elements);
            valsCSRPtr      = cuda.CopyHostToDevice(vecVals);
            idxCSRPtr       = cuda.CopyHostToDevice(vecIdx);
            vecLenghtCSRPtr = cuda.CopyHostToDevice(vecLenght);


            CudaHelpers.TransformToCSCFormat(out vecVals, out vecIdx, out vecLenght, sub_prob.Elements);
            valsCSCPtr      = cuda.CopyHostToDevice(vecVals);
            idxCSCPtr       = cuda.CopyHostToDevice(vecIdx);
            vecLenghtCSCPtr = cuda.CopyHostToDevice(vecLenght);



            /*
             * allocate memory for gradient
             */
            uint memSize = (uint)(sub_prob.ElementsCount * sizeof(float));

            //allocate mapped memory for our results (dot product beetween vector W and all elements)
            gradIntPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            gradPtr    = cuda.GetHostDevicePointer(gradIntPtr, 0);

            //allocate memory for main vector, size of this vector is the same as dimenson, so many
            //indexes will be zero, but cuda computation is faster
            mainVector = new float[vecDim];


            //move W wector
            //CudaHelpers.FillDenseVector(problemElements[0], mainVector);
            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);


            //set texture memory for labels
            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, sub_prob.Y, ref labelsPtr);


            /*
             * data for cuda solver
             */

            //normaly for L2 solver QDii= xii*xii+Diag_i
            //where Diag_i = 0.5/Cp if yi=1
            //      Diag_i = 0.5/Cn if yi=-1
            //but we will add this on GPU
            QD     = new float[sub_prob.ElementsCount];
            alpha  = new float[sub_prob.ElementsCount];
            deltas = new float[sub_prob.ElementsCount];
            float[] diag = new float[3];
            for (int i = 0; i < sub_prob.ElementsCount; i++)
            {
                QD[i]     = sub_prob.Elements[i].DotProduct();
                alpha[i]  = 0f;
                deltas[i] = 0;
            }

            qdPtr = cuda.CopyHostToDevice(QD);

            alphaPtr = cuda.Allocate(alpha);


            //deltasPtr = cuda.Allocate(deltas);
            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuDeltasTexRef, "deltasTexRef", deltas, ref deltasPtr);

            diagPtr = cuda.GetModuleGlobal(cuModule, "diag_shift");
            //set this in fill function
            //cuda.CopyHostToDevice(diagPtr, diag);

            //CUdeviceptr dimPtr = cuda.GetModuleGlobal(cuModule, "Dim");
            ////todo: check if it ok
            ////cuda.Memset(dimPtr,(uint) vecDim, 1);
            //int[] dimArr = new int[] { vecDim };
            //cuda.CopyHostToDevice(dimPtr,dimArr);

            //CUDARuntime.cudaMemcpyToSymbol("Dim", dimPtr, 1, 0, cudaMemcpyKind.cudaMemcpyHostToDevice);
            //CUDARuntime.cudaMemcpyToSymbol("Dim", ,1,0, cudaMemcpyKind.cudaMemcpyHostToDevice);

            CUdeviceptr deltaScalingPtr = cuda.GetModuleGlobal(cuModule, "stepScaling");

            //two ways of computing scaling param, should be the same, but it depends on rounding.
            //stepScaling = (float)(1.0 / Math.Sqrt(sub_prob.ElementsCount));

            stepScaling = 0.0002f;// (float)(1.0 / sub_prob.ElementsCount);

            //set scaling constant
            float[] scArr = new float[] { stepScaling };
            cuda.CopyHostToDevice(deltaScalingPtr, scArr);
            //cuda.Memset(deltaScalingPtr, (uint) scaling,sizeof(float));

            //cuda.CopyHostToDevice(dimPtr, problem.Elements[0].Dim);

            SetCudaParameters(sub_prob);
        }
示例#21
0
        static void Main(string[] args)
        {
            // Create a new instance of CUDA class, select 1st device.
            CUDA cuda = new CUDA(0, true);

            // Prepare parameters.
            int n = 16 * 1024 * 1024;
            uint nbytes = (uint)(n * sizeof(int));
            int value = 26;

            // allocate host memory
            int[] a = new int[n];

            // allocate device memory
            CUdeviceptr d_a = cuda.Allocate<int>(a);
            CUDADriver.cuMemsetD8(d_a, 0xff, nbytes);

            // load module
            cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "asyncAPI.ptx"));
            CUfunction func = cuda.GetModuleFunction("increment_kernel");

            // set kernel launch configuration
            cuda.SetFunctionBlockShape(func, 512, 1, 1);

            // create cuda event handles
            CUevent start = cuda.CreateEvent();
            CUevent stop = cuda.CreateEvent();

            // asynchronously issue work to the GPU (all to stream 0)
            CUstream stream = new CUstream();
            cuda.RecordEvent(start);
            cuda.CopyHostToDeviceAsync<int>(d_a, a, stream);

            // set parameters for kernel function
            cuda.SetParameter(func, 0, (uint)d_a.Pointer);
            cuda.SetParameter(func, IntPtr.Size, (uint)value);

            cuda.SetParameterSize(func, (uint)(IntPtr.Size + 4));

            // actually launch kernel
            cuda.LaunchAsync(func, n / 512, 1, stream);

            // wait for every thing to finish, then start copy back data
            cuda.CopyDeviceToHostAsync<int>(d_a, a, stream);

            cuda.RecordEvent(stop);

            // print the cpu and gpu times
            Console.WriteLine("time spent executing by the GPU: {0} ms", cuda.ElapsedTime(start, stop));

            // check the output for correctness
            if (CorrectOutput(a, value))
                Console.WriteLine("Test PASSED");
            else
                Console.WriteLine("Test FAILED");

            // release resources
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(stop);
            cuda.Free(d_a);
        }
示例#22
0
		public unsafe void InitTasks()
		{
			bool doMidside = channels == 2 && eparams.do_midside;
			int channelCount = doMidside ? 2 * channels : channels;

			if (!inited)
			{
				cuda = new CUDA(true, InitializationFlags.None);
				cuda.CreateContext(0, CUCtxFlags.SchedAuto);
				using (Stream cubin = GetType().Assembly.GetManifestResourceStream(GetType(), "flacuda.cubin"))
				using (StreamReader sr = new StreamReader(cubin))
					cuda.LoadModule(new ASCIIEncoding().GetBytes(sr.ReadToEnd()));
				//cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
				if (_IO == null)
					_IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read);
				int header_size = flake_encode_init();
				_IO.Write(header, 0, header_size);
				if (_IO.CanSeek)
					first_frame_offset = _IO.Position;

				task1 = new FlaCudaTask(cuda, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
				task2 = new FlaCudaTask(cuda, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
				if (_settings.CPUThreads > 0)
				{
					cpu_tasks = new FlaCudaTask[_settings.CPUThreads];
					for (int i = 0; i < cpu_tasks.Length; i++)
						cpu_tasks[i] = new FlaCudaTask(cuda, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
				}
				cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);

				inited = true;
			}
		}