Example #1
0
        private void InitCudaModule()
        {
            cuda = gpuKernel.cuda;

            //cuda = new CUDA(0, true);
            //cuCtx = cuda.CreateContext(0, CUCtxFlags.MapHost);
            //cuda.SetCurrentContext(cuCtx);

            string modluePath = Path.Combine(Environment.CurrentDirectory, cudaModuleName);

            if (!File.Exists(modluePath))
            {
                throw new ArgumentException("Failed access to cuda module" + modluePath);
            }

            cuModule           = cuda.LoadModule(modluePath);
            cuFuncFindMaxIMinJ = cuda.GetModuleFunction(funcFindMaxIMinJ);

            cuFuncUpdateG = cuda.GetModuleFunction(funcUpdateGFunc);
        }
Example #2
0
 public static extern CUResult cuParamSetv(CUfunction hfunc, int offset, ref double ptr, uint numbytes);
Example #3
0
 public static extern CUResult cuParamSetv(CUfunction hfunc, int offset, ref long value, uint numbytes);
Example #4
0
 public static extern CUResult cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);
Example #5
0
 public static extern CUResult cuFuncGetAttribute(ref int pi, CUFunctionAttribute attrib, CUfunction hfunc);
Example #6
0
        static void Main(string[] args)
        {
            // Init and select 1st device.
            CUDA cuda = new CUDA(0, true);

            // load module
            //cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "simpleCUFFT.ptx"));
            CUfunction func = new CUfunction();// cuda.GetModuleFunction("ComplexPointwiseMulAndScale");

            // The filter size is assumed to be a number smaller than the signal size
            const int SIGNAL_SIZE = 50;
            const int FILTER_KERNEL_SIZE = 11;

            // Allocate host memory for the signal
            Float2[] h_signal = new Float2[SIGNAL_SIZE];
            // Initalize the memory for the signal
            Random r = new Random();
            for (int i = 0; i < SIGNAL_SIZE; ++i)
            {
                h_signal[i].x = r.Next() / (float)int.MaxValue;
                h_signal[i].y = 0;
            }

            // Allocate host memory for the filter
            Float2[] h_filter_kernel = new Float2[FILTER_KERNEL_SIZE];
            // Initalize the memory for the filter
            for (int i = 0; i < FILTER_KERNEL_SIZE; ++i)
            {
                h_filter_kernel[i].x = r.Next() / (float)int.MaxValue;
                h_filter_kernel[i].y = 0;
            }

            // Pad signal and filter kernel
            Float2[] h_padded_signal;
            Float2[] h_padded_filter_kernel;
            int new_size = PadData(h_signal, out h_padded_signal, SIGNAL_SIZE,
                                   h_filter_kernel, out h_padded_filter_kernel, FILTER_KERNEL_SIZE);

            // Allocate device memory for signal
            // Copy host memory to device
            CUdeviceptr d_signal = cuda.CopyHostToDevice<Float2>(h_padded_signal);

            // Allocate device memory for filter kernel
            // Copy host memory to device
            CUdeviceptr d_filter_kernel = cuda.CopyHostToDevice<Float2>(h_padded_filter_kernel);

            // CUFFT plan
            CUFFT fft = new CUFFT(cuda);
            cufftHandle handle = new cufftHandle();
            CUFFTResult fftres = CUFFTDriver.cufftPlan1d(ref handle, new_size, CUFFTType.C2C, 1);
            //fft.Plan1D(new_size, CUFFTType.C2C, 1);


            return;

            // Transform signal and kernel
            fft.ExecuteComplexToComplex(d_signal, d_signal, CUFFTDirection.Forward);
            fft.ExecuteComplexToComplex(d_filter_kernel, d_filter_kernel, CUFFTDirection.Forward);

            // Multiply the coefficients together and normalize the result
            // ComplexPointwiseMulAndScale<<<32, 256>>>(d_signal, d_filter_kernel, new_size, 1.0f / new_size);
            cuda.SetFunctionBlockShape(func, 256, 1, 1);
            cuda.SetParameter(func, 0, (uint)d_signal.Pointer);
            cuda.SetParameter(func, IntPtr.Size, (uint)d_filter_kernel.Pointer);
            cuda.SetParameter(func, IntPtr.Size * 2, (uint)new_size);
            cuda.SetParameter(func, IntPtr.Size * 2 + 4, 1.0f / new_size);
            cuda.SetParameterSize(func, (uint)(IntPtr.Size * 2 + 8));
            cuda.Launch(func, 32, 1);

            // Transform signal back
            fft.ExecuteComplexToComplex(d_signal, d_signal, CUFFTDirection.Inverse);

            // Copy device memory to host
            Float2[] h_convolved_signal = h_padded_signal;
            cuda.CopyDeviceToHost<Float2>(d_signal, h_convolved_signal);

            // Allocate host memory for the convolution result
            Float2[] h_convolved_signal_ref = new Float2[SIGNAL_SIZE];

            // Convolve on the host
            Convolve(h_signal, SIGNAL_SIZE,
                     h_filter_kernel, FILTER_KERNEL_SIZE,
                     h_convolved_signal_ref);

            // check result
            bool res = cutCompareL2fe(h_convolved_signal_ref, h_convolved_signal, 2 * SIGNAL_SIZE, 1e-5f);
            Console.WriteLine("Test {0}", (true == res) ? "PASSED" : "FAILED");

            //Destroy CUFFT context
            fft.Destroy();

            // cleanup memory
            cuda.Free(d_signal);
            cuda.Free(d_filter_kernel);
        }
        public override void DoLayout()
        {
            CUdeviceptr p1 = new CUdeviceptr();

            CUDADriver.cuMemAlloc(ref p1, 1 <<10);
            byte[] b = new byte[1<<10];
            CUDADriver.cuMemcpyHtoD(p1, b, (uint) b.Length);

            CUfunction func = new CUfunction();
            CUResult res;

            int nnodes = (int) Network.VertexCount*2;
            int blocks = 32;

            if (nnodes < 1024*blocks) nnodes = 1024*blocks;
            while ((nnodes & (prop.SIMDWidth-1)) != 0) nnodes++;
                nnodes--;

            //float dtime = 0.025f;  float dthf = dtime * 0.5f;
            //float epssq = 0.05f * 0.05f;
            //float itolsq = 1.0f / (0.5f * 0.5f);

            CUDADriver.cuModuleGetFunction(ref func, mod, "dummy");

            // Float4[] data = new Float4[100];
            CUdeviceptr ptr = new CUdeviceptr();
            //CUDADriver.cuMemAlloc(ref ptr, (uint) 100 * System.Runtime.InteropServices.Marshal.SizeOf(Float4));
            CUDADriver.cuParamSeti(func, 0, (uint) ptr.Pointer);
            CUDADriver.cuParamSetSize(func, 4);

            res = CUDADriver.cuLaunch(func);
            if(res != CUResult.Success)
                Logger.AddMessage(LogEntryType.Warning, "CUDA Error in dummy function: " +res.ToString());

            // InitializationKernel<<<1, 1>>>();
            CUDADriver.cuModuleGetFunction(ref func, mod, "InitializationKernel");
            res = CUDADriver.cuLaunch(func);
            if(res != CUResult.Success)
                Logger.AddMessage(LogEntryType.Warning, "CUDA Error in InitializationKernel: " +res.ToString());

            // BoundingBoxKernel<<<blocks * FACTOR1, THREADS1>>>();
            CUDADriver.cuModuleGetFunction(ref func, mod, "BoundingBoxKernel: "+res.ToString());
            CUDADriver.cuLaunch(func);
            if(res != CUResult.Success)
                Logger.AddMessage(LogEntryType.Warning, "CUDA Error in BoundingBoxKernel: "+res.ToString());

            // TreeBuildingKernel<<<blocks * FACTOR2, THREADS2>>>();
            CUDADriver.cuModuleGetFunction(ref func, mod, "TreeBuildingKernel: "+res.ToString());
            CUDADriver.cuLaunch(func);
            if(res != CUResult.Success)
                Logger.AddMessage(LogEntryType.Warning, "CUDA Error in TreeBuildingKernel: "+res.ToString());

            // SummarizationKernel<<<blocks * FACTOR3, THREADS3>>>();
            CUDADriver.cuModuleGetFunction(ref func, mod, "SummarizationKernel: "+res.ToString());
            CUDADriver.cuLaunch(func);
            if(res != CUResult.Success)
                Logger.AddMessage(LogEntryType.Warning, "CUDA Error in SummarizationKernel: "+res.ToString());

            // ForceCalculationKernel<<<blocks * FACTOR5, THREADS5>>>();
            CUDADriver.cuModuleGetFunction(ref func, mod, "ForceCalculationKernel: "+res.ToString());
            CUDADriver.cuLaunch(func);
            if(res != CUResult.Success)
                Logger.AddMessage(LogEntryType.Warning, "CUDA Error in ForceCalculationKernel: "+res.ToString());

            // IntegrationKernel<<<blocks * FACTOR6, THREADS6>>>();
            CUDADriver.cuModuleGetFunction(ref func, mod, "IntegrationKernel");
            CUDADriver.cuLaunch(func);
            if(res != CUResult.Success)
                Logger.AddMessage(LogEntryType.Warning, "CUDA Error in IntegrationKernel: "+res.ToString());
        }
        private void InitCudaModule()
        {
            string modluePath = Path.Combine(Environment.CurrentDirectory, cudaModuleName);
            if (!File.Exists(modluePath))
                throw new ArgumentException("Failed access to cuda module" + modluePath);

            cuModule = cuda.LoadModule(modluePath);
            cuFuncDense = cuda.GetModuleFunction(funcName);
        }
Example #9
0
 public static extern CUResult cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);
Example #10
0
 public static extern CUResult cuLaunchGrid(CUfunction f, int grid_width, int grid_height);
Example #11
0
 public static extern CUResult cuLaunch(CUfunction f);
Example #12
0
 public static extern CUResult cuFuncSetSharedSize(CUfunction hfunc, uint bytes);
Example #13
0
 public static extern CUResult cuFuncSetCacheConfig(CUfunction hfunc, CUFunctionCache config);
Example #14
0
 public static extern CUResult cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
Example #15
0
 public static extern CUResult cuParamSetv(CUfunction hfunc, int offset, [In] Short1[] ptr, uint numbytes);
Example #16
0
        //private double ComputeObj(float[] w, float[] alpha, Problem<SparseVec> sub_prob, float[] diag)
        //{
        //    double v = 0, v1=0;
        //    int nSV = 0;
        //    for (int i = 0; i < w.Length; i++)
        //    {
        //        v += w[i] * w[i];
        //        v1 += 0.5*w[i] * w[i];
        //    }
        //    for (int i = 0; i < alpha.Length; i++)
        //    {
        //        sbyte y_i = (sbyte)sub_prob.Y[i];

        //        //original line
        //        //v += alpha[i] * (alpha[i] * diag[GETI(y_i, i)] - 2);
        //        v += alpha[i] * (alpha[i] * diag[y_i + 1] - 2);
        //        v1 += 0.5* alpha[i] * (alpha[i] * diag[y_i + 1] - 2);
        //        if (alpha[i] > 0) ++nSV;
        //    }

        //    v = v / 2;
        //  //  Debug.WriteLine("Objective value = {0}", v);
        //  //  Debug.WriteLine("nSV = {0}", nSV);

        //    return v;
        //}



        protected void InitCudaModule()
        {
            cuda = new CUDA(0, true);
            cuModule = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, cudaModuleName));
            cuFuncDotProd = cuda.GetModuleFunction(cudaProductKernelName);
            cuFuncSolver = cuda.GetModuleFunction(cudaSolveL2SVM);
            cuFuncUpdateW = cuda.GetModuleFunction(cudaUpdateW);
        }
Example #17
0
 public static extern CUResult cuModuleGetFunction(ref CUfunction hfunc, CUmodule hmod, string name);
Example #18
0
        public void Initialize()
        {
            float softeningSquared = 0.00125f;
            Random random = new Random();

            m_CUDA.LoadModule(m_nbody_kernel);
            m_IntegrateBodies = m_CUDA.GetModuleFunction("IntegrateBodies");
            m_SofteningSquared = m_CUDA.GetModuleGlobal("softeningSquared");

            h_Pos = new Float4[2][] { new Float4[m_NumBodies], new Float4[m_NumBodies] };
            h_Vel = new Float4[2][] { new Float4[m_NumBodies], new Float4[m_NumBodies] };
            d_Pos = new CUdeviceptr[2] { m_CUDA.Allocate<Float4>(HostOldPos), m_CUDA.Allocate<Float4>(HostNewPos) };
            d_Vel = new CUdeviceptr[2] { m_CUDA.Allocate<Float4>(HostOldVel), m_CUDA.Allocate<Float4>(HostNewVel) };

            float scale = 3.0f;
            float vscale = scale * 1.0f;

            for (int i = 0; i < HostOldPos.Length; i++)
            {
            recalc:
                HostOldPos[i].x = (float)(random.NextDouble() * 2 - 1.0);
                HostOldPos[i].y = (float)(random.NextDouble() * 2 - 1.0);
                HostOldPos[i].z = (float)(random.NextDouble() * 2 - 1.0);
                HostOldPos[i].w = 1.0f;

                if (dot(HostOldPos[i], HostOldPos[i]) > 1.0f)
                    goto recalc;

                HostOldPos[i].x *= scale;
                HostOldPos[i].y *= scale;
                HostOldPos[i].z *= scale;
            }

            for (int i = 0; i < HostOldVel.Length; i++)
            {
            recalc:
                HostOldVel[i].x = (float)(random.NextDouble() * 2 - 1.0);
                HostOldVel[i].y = (float)(random.NextDouble() * 2 - 1.0);
                HostOldVel[i].z = (float)(random.NextDouble() * 2 - 1.0);
                HostOldVel[i].w = 1.0f;
                if (dot(HostOldVel[i], HostOldVel[i]) > 1.0f)
                    goto recalc;

                HostOldPos[i].x *= vscale;
                HostOldPos[i].y *= vscale;
                HostOldPos[i].z *= vscale;
            }

            m_CUDA.CopyHostToDevice<Float4>(DeviceOldPos, HostOldPos);
            m_CUDA.CopyHostToDevice<Float4>(DeviceOldVel, HostOldVel);
            m_CUDA.CopyHostToDevice<float>(m_SofteningSquared, new float[] { softeningSquared });
        }
Example #19
0
 public static extern CUResult cuParamSetf(CUfunction hfunc, int offset, float value);
Example #20
0
 public static extern CUResult cuParamSeti(CUfunction hfunc, int offset, uint value);
Example #21
0
 public static extern CUResult cuParamSetSize(CUfunction hfunc, uint numbytes);
Example #22
0
        private void InitCudaModule()
        {
            cuda = new CUDA(0, true);
            cuModule = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, cudaModuleName));

            cuFuncDotProd = cuda.GetModuleFunction(cudaProductKernelName);

            cuFuncGradFinalize = cuda.GetModuleFunction(cudaGradFinalizeName);

            cuFuncComputeBBstep = cuda.GetModuleFunction(cudaComputeBBStepName);

            cuFuncObjSquareW = cuda.GetModuleFunction(cudaObjWName);
            cuFuncObjSquareAlpha = cuda.GetModuleFunction(cudaObjAlphaName);

            cuFuncUpdateW = cuda.GetModuleFunction(cudaUpdateW);

            cuFuncUpdateAlpha = cuda.GetModuleFunction(cudaUpdateAlphaName);

            cuFuncMaxNorm = cuda.GetModuleFunction(cudaMaxNormName);
        }
		unsafe public FlaCudaTask(CUDA _cuda, int channelCount, int channels, uint bits_per_sample, int max_frame_size, bool do_verify)
		{
			cuda = _cuda;

			residualTasksLen = sizeof(FlaCudaSubframeTask) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FlaCudaWriter.maxFrames;
			bestResidualTasksLen = sizeof(FlaCudaSubframeTask) * channelCount * FlaCudaWriter.maxFrames;
			samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount;
			int partitionsLen = sizeof(int) * (30 << 8) * channelCount * FlaCudaWriter.maxFrames;
			int riceParamsLen = sizeof(int) * (4 << 8) * channelCount * FlaCudaWriter.maxFrames;
			int lpcDataLen = sizeof(float) * 32 * 33 * lpc.MAX_LPC_WINDOWS * channelCount * FlaCudaWriter.maxFrames;

			cudaSamplesBytes = cuda.Allocate((uint)samplesBufferLen / 2);
			cudaSamples = cuda.Allocate((uint)samplesBufferLen);
			cudaResidual = cuda.Allocate((uint)samplesBufferLen);
			cudaLPCData = cuda.Allocate((uint)lpcDataLen);
			cudaPartitions = cuda.Allocate((uint)partitionsLen);
			cudaRiceParams = cuda.Allocate((uint)riceParamsLen);
			cudaBestRiceParams = cuda.Allocate((uint)riceParamsLen / 4);
			cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FlaCudaWriter.maxAutocorParts + FlaCudaWriter.maxFrames)));
			cudaResidualTasks = cuda.Allocate((uint)residualTasksLen);
			cudaBestResidualTasks = cuda.Allocate((uint)bestResidualTasksLen);
			cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * 64 /*FlaCudaWriter.maxResidualParts*/ * FlaCudaWriter.maxFrames));
			CUResult cuErr = CUResult.Success;
			if (cuErr == CUResult.Success)
				cuErr = CUDADriver.cuMemAllocHost(ref samplesBytesPtr, (uint)samplesBufferLen/2);
			if (cuErr == CUResult.Success)
				cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen);
			if (cuErr == CUResult.Success)
				cuErr = CUDADriver.cuMemAllocHost(ref bestRiceParamsPtr, (uint)riceParamsLen / 4);
			if (cuErr == CUResult.Success)
				cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)residualTasksLen);
			if (cuErr == CUResult.Success)
				cuErr = CUDADriver.cuMemAllocHost(ref bestResidualTasksPtr, (uint)bestResidualTasksLen);
			if (cuErr != CUResult.Success)
			{
				if (samplesBytesPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBytesPtr); samplesBytesPtr = IntPtr.Zero;
				if (residualBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualBufferPtr); residualBufferPtr = IntPtr.Zero;
				if (bestRiceParamsPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestRiceParamsPtr); bestRiceParamsPtr = IntPtr.Zero;
				if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero;
				if (bestResidualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestResidualTasksPtr); bestResidualTasksPtr = IntPtr.Zero;
				throw new CUDAException(cuErr);
			}

			cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
			cudaStereoDecorr = cuda.GetModuleFunction("cudaStereoDecorr");
			cudaChannelDecorr = cuda.GetModuleFunction("cudaChannelDecorr");
			cudaChannelDecorr2 = cuda.GetModuleFunction("cudaChannelDecorr2");
			cudaFindWastedBits = cuda.GetModuleFunction("cudaFindWastedBits");
			cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
			cudaQuantizeLPC = cuda.GetModuleFunction("cudaQuantizeLPC");
			cudaComputeLPCLattice = cuda.GetModuleFunction("cudaComputeLPCLattice");
			cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
			cudaEstimateResidual8 = cuda.GetModuleFunction("cudaEstimateResidual8");
			cudaEstimateResidual12 = cuda.GetModuleFunction("cudaEstimateResidual12");
			cudaEstimateResidual1 = cuda.GetModuleFunction("cudaEstimateResidual1");
			cudaChooseBestMethod = cuda.GetModuleFunction("cudaChooseBestMethod");
			cudaCopyBestMethod = cuda.GetModuleFunction("cudaCopyBestMethod");
			cudaCopyBestMethodStereo = cuda.GetModuleFunction("cudaCopyBestMethodStereo");
			cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
			cudaCalcPartition = cuda.GetModuleFunction("cudaCalcPartition");
			cudaCalcPartition16 = cuda.GetModuleFunction("cudaCalcPartition16");
			cudaCalcLargePartition = cuda.GetModuleFunction("cudaCalcLargePartition");
			cudaSumPartition = cuda.GetModuleFunction("cudaSumPartition");
			cudaFindRiceParameter = cuda.GetModuleFunction("cudaFindRiceParameter");
			cudaFindPartitionOrder = cuda.GetModuleFunction("cudaFindPartitionOrder");

			stream = cuda.CreateStream();
			samplesBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channelCount];
			outputBuffer = new byte[max_frame_size * FlaCudaWriter.maxFrames + 1];
			frame = new FlacFrame(channelCount);
			frame.writer = new BitWriter(outputBuffer, 0, outputBuffer.Length);

			if (do_verify)
			{
				verify = new FlakeReader(new AudioPCMConfig((int)bits_per_sample, channels, 44100));
				verify.DoCRC = false;
			}
		}