Example #1
0
        /// <summary>
        /// Uploads a host array to device memory via <c>cublasSetVector</c> and
        /// stores the returned status in <c>LastError</c>.
        /// </summary>
        /// <typeparam name="T">Element type; its size is taken from <c>CUDA.MSizeOf</c>.</typeparam>
        /// <param name="data">Host array to copy. It is pinned for the duration of the call.</param>
        /// <param name="incx">Forwarded unchanged as the <c>incx</c> argument.</param>
        /// <param name="ptr">Destination device pointer.</param>
        /// <param name="incy">Forwarded unchanged as the <c>incy</c> argument.</param>
        public void SetVector <T>(T[] data, int incx, CUdeviceptr ptr, int incy)
        {
            GCHandle handle = GCHandle.Alloc(data, GCHandleType.Pinned);

            try
            {
                this.LastError = CUBLASDriver.cublasSetVector(data.Length, CUDA.MSizeOf(typeof(T)), handle.AddrOfPinnedObject(), incx, ptr, incy);
            }
            finally
            {
                // Release the pin even when the native call or the LastError
                // setter throws; otherwise the array stays pinned forever.
                handle.Free();
            }
        }
Example #2
0
    /// <summary>
    /// Demo entry point: allocates vectors through <c>CudaManager</c>, then runs
    /// two worker threads one after another (one to change a vector on the
    /// device, one to read it back) and prints the first five elements of
    /// <c>vector2</c>.
    /// </summary>
    static void Main()
    {
        cuda   = new CUDA(true);
        cublas = new CUBLAS(cuda);

        // Allocate the vectors on the device from the main thread.
        CudaManager.CallMethod(AllocateVectors);

        // Step 1: modify the first vector from a separate foreground thread.
        Thread writerThread = new Thread(ChangeVectorOnDevice_ThreadRun);
        writerThread.IsBackground = false;
        writerThread.Start();

        // Block on the shared event (presumably signalled by the worker when it is done).
        autoResetEvent.WaitOne();

        // Step 2: fetch the vector from the device on yet another thread.
        Thread readerThread = new Thread(GetVectorFromDevice_ThreadRun);
        readerThread.IsBackground = false;
        readerThread.Start();

        // Same event again; this time it gates the read-back.
        autoResetEvent.WaitOne();

        Console.WriteLine("({0}, {1}, {2}, {3}, {4})", vector2[0], vector2[1], vector2[2], vector2[3], vector2[4]);
        Console.ReadKey(true);
    }
Example #3
0
        /// <summary>
        /// Requests device memory from <c>cublasAlloc</c> for as many elements as
        /// <paramref name="array"/> holds and records the status in <c>LastError</c>.
        /// </summary>
        /// <typeparam name="T">Element type; its size is taken from <c>CUDA.MSizeOf</c>.</typeparam>
        /// <param name="array">Host array whose length determines the element count.</param>
        /// <returns>The device pointer filled in by <c>cublasAlloc</c>.</returns>
        public CUdeviceptr Allocate <T>(T[] array)
        {
            int elementCount = array.Length;
            int elementSize  = CUDA.MSizeOf(typeof(T));

            CUdeviceptr allocated = new CUdeviceptr();
            this.LastError = CUBLASDriver.cublasAlloc(elementCount, elementSize, ref allocated);

            return allocated;
        }
Example #4
0
        /// <summary>
        /// Runs the frequency-count kernel from <c>apriori_count1.cubin</c> (loaded from
        /// the directory of the executing assembly) and returns the counts.
        /// </summary>
        /// <typeparam name="T">Element type of the candidate set uploaded to the device.</typeparam>
        /// <param name="elements">Elements uploaded as the input set; their count is passed to the kernel call.</param>
        /// <param name="bitmapWrapper">Source of the <c>RgbValues</c> uploaded as input data.</param>
        /// <returns>
        /// An array of <c>ProjectConstants.BlockSize * ProjectConstants.GridSize</c> counters
        /// copied back from the device.
        /// </returns>
        private static int[] CalculateElementsFrequencies <T>(IList <T> elements, BitmapWrapper bitmapWrapper)
        {
            int frequencesSize      = ProjectConstants.BlockSize * ProjectConstants.GridSize;
            var elementsFrequencies = new int[frequencesSize];

            using (var cuda = new CUDA(0, true))
            {
                var path = Path.Combine(Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location),
                                        "apriori_count1.cubin");

                cuda.LoadModule(path);

                var inputData    = cuda.CopyHostToDevice(bitmapWrapper.RgbValues);
                var inputSetData = cuda.CopyHostToDevice(elements.ToArray());

                // Device-side output buffer with one int per counter.
                var answer = cuda.Allocate(new int[frequencesSize]);

                CallTheFrequencyCount(cuda, inputData, inputSetData, answer, bitmapWrapper, elements.Count);
                cuda.CopyDeviceToHost(answer, elementsFrequencies);

                // Release every device buffer allocated above. The input-set
                // buffer was previously never freed.
                cuda.Free(inputData);
                cuda.Free(inputSetData);
                cuda.Free(answer);
                cuda.UnloadModule();
            }
            return(elementsFrequencies);
        }
Example #5
0
 /// <summary>
 /// Stores the CUDA wrapper and the simulation settings on the new instance.
 /// </summary>
 /// <param name="CUDA">CUDA wrapper kept in <c>m_CUDA</c>.</param>
 /// <param name="DeltaTime">Value kept in <c>m_DeltaTime</c>.</param>
 /// <param name="Damping">Value kept in <c>m_Damping</c>.</param>
 /// <param name="NumBodies">Value kept in <c>m_NumBodies</c>.</param>
 public NBody(CUDA CUDA, float DeltaTime, float Damping, int NumBodies)
 {
     // Device wrapper.
     this.m_CUDA = CUDA;

     // Simulation settings.
     this.m_DeltaTime = DeltaTime;
     this.m_Damping   = Damping;
     this.m_NumBodies = NumBodies;
 }
Example #6
0
        //public void Create()
        //{
        //    this.LastError = CUBLASDriver.cublasCreate_v2();
        //}

        /// <summary>
        /// Uploads a host matrix to device memory via <c>cublasSetMatrix</c> and
        /// stores the returned status in <c>LastError</c>.
        /// </summary>
        /// <typeparam name="T">Element type; its size is taken from <c>CUDA.MSizeOf</c>.</typeparam>
        /// <param name="rows">Forwarded as the row count.</param>
        /// <param name="cols">Forwarded as the column count.</param>
        /// <param name="data">Host array holding the matrix. It is pinned for the duration of the call.</param>
        /// <param name="lda">Forwarded unchanged as the <c>lda</c> argument.</param>
        /// <param name="ptr">Destination device pointer.</param>
        /// <param name="ldb">Forwarded unchanged as the <c>ldb</c> argument.</param>
        public void SetMatrix <T>(int rows, int cols, T[] data, int lda, CUdeviceptr ptr, int ldb)
        {
            GCHandle handle = GCHandle.Alloc(data, GCHandleType.Pinned);

            try
            {
                this.LastError = CUBLASDriver.cublasSetMatrix(rows, cols, CUDA.MSizeOf(typeof(T)), handle.AddrOfPinnedObject(), lda, ptr, ldb);
            }
            finally
            {
                // Release the pin even when the native call or the LastError
                // setter throws; otherwise the array stays pinned forever.
                handle.Free();
            }
        }
Example #7
0
        //private double ComputeObj(float[] w, float[] alpha, Problem<SparseVec> sub_prob, float[] diag)
        //{
        //    double v = 0, v1=0;
        //    int nSV = 0;
        //    for (int i = 0; i < w.Length; i++)
        //    {
        //        v += w[i] * w[i];
        //        v1 += 0.5*w[i] * w[i];
        //    }
        //    for (int i = 0; i < alpha.Length; i++)
        //    {
        //        sbyte y_i = (sbyte)sub_prob.Y[i];

        //        //original line
        //        //v += alpha[i] * (alpha[i] * diag[GETI(y_i, i)] - 2);
        //        v += alpha[i] * (alpha[i] * diag[y_i + 1] - 2);
        //        v1 += 0.5* alpha[i] * (alpha[i] * diag[y_i + 1] - 2);
        //        if (alpha[i] > 0) ++nSV;
        //    }

        //    v = v / 2;
        //  //  Debug.WriteLine("Objective value = {0}", v);
        //  //  Debug.WriteLine("nSV = {0}", nSV);

        //    return v;
        //}



        /// <summary>
        /// Creates the CUDA wrapper, loads the module named by <c>cudaModuleName</c>
        /// from the current directory and resolves the three kernel handles.
        /// </summary>
        protected void InitCudaModule()
        {
            cuda = new CUDA(0, true);

            string modulePath = Path.Combine(Environment.CurrentDirectory, cudaModuleName);
            cuModule = cuda.LoadModule(modulePath);

            // Kernel entry points, looked up by their configured names.
            cuFuncDotProd = cuda.GetModuleFunction(cudaProductKernelName);
            cuFuncSolver  = cuda.GetModuleFunction(cudaSolveL2SVM);
            cuFuncUpdateW = cuda.GetModuleFunction(cudaUpdateW);
        }
Example #8
0
 /// <summary>
 /// Disposes the test helper and the CUDA instance, if CUDA was started,
 /// and clears both references so the method can be called again safely.
 /// </summary>
 public static void CudaStop()
 {
     if (_cuda == null)
     {
         return;
     }

     // _cudaTest is assigned only after _cuda, so it can still be null when
     // its construction failed. Guard it instead of throwing a
     // NullReferenceException and skipping the _cuda disposal below.
     if (_cudaTest != null)
     {
         _cudaTest.Dispose();
         _cudaTest = null;
     }

     _cuda.Dispose();
     _cuda = null;
 }
Example #9
0
 /// <summary>
 /// Builds a <c>CudaFunctionCall</c> for <c>Names.CountFrequency</c>, appends the
 /// input pointer, output pointer, width and height as parameters, and executes
 /// it with the block and grid sizes from <c>ProjectConstants</c>.
 /// </summary>
 /// <param name="cuda">CUDA wrapper the call is bound to.</param>
 /// <param name="deviceInput">Device pointer passed as the first parameter.</param>
 /// <param name="deviceOutput">Device pointer passed as the second parameter.</param>
 /// <param name="wrapper">Supplies <c>Width</c> and <c>Height</c>, both passed as <c>uint</c>.</param>
 protected virtual void CallTheFrequencyCount(CUDA cuda, CUdeviceptr deviceInput, CUdeviceptr deviceOutput, BitmapWrapper wrapper)
 {
     var call = new CudaFunctionCall(cuda, Names.CountFrequency);

     // Parameters are appended in the order the kernel receives them.
     var withInput  = call.AddParameter(deviceInput);
     var withOutput = withInput.AddParameter(deviceOutput);
     var withWidth  = withOutput.AddParameter((uint)wrapper.Width);
     var withHeight = withWidth.AddParameter((uint)wrapper.Height);

     withHeight.Execute(ProjectConstants.BlockSize, ProjectConstants.BlockSize, 1, ProjectConstants.GridSize, ProjectConstants.GridSize);
 }
Example #10
0
        /// <summary>
        /// Prepares the CUDA side of the evaluator: creates the context, loads the
        /// module and its kernels, allocates the host/device buffers for one support
        /// vector, binds the "svTexRef" texture, and uploads the per-support-vector
        /// labels and alphas. Sets <c>IsInitialized</c> when done.
        /// </summary>
        public override void Init()
        {
            cuda = new CUDA(0, true);

            // Create a context with the MapHost flag and make it current.
            var cuCtx = cuda.CreateContext(0, CUCtxFlags.MapHost);

            cuda.SetCurrentContext(cuCtx);

            // Module is loaded from the current working directory.
            cuModule = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, cudaModuleName));
            cuFunc   = cuda.GetModuleFunction(cudaEvaluatorKernelName);

            cuFuncSign = cuda.GetModuleFunction(cudaSignKernelName);

            //reserved memory based on dimension of support vector
            //svVector = new float[TrainedModel.SupportElements[0].Count];

            stream = cuda.CreateStream();
            //memSvSize = (uint)(TrainedModel.SupportElements[0].Count * sizeof(float));
            // Buffer size in bytes: Dim of the first support element times sizeof(float).
            memSvSize = (uint)(TrainedModel.SupportElements[0].Dim * sizeof(float));

            // Allocates two host buffers of memSvSize bytes, then starts an async
            // host-to-device copy of the first one on `stream`.
            // NOTE(review): svVecIntPtrs[0] has not been written to at this point, so
            // the copy presumably only serves to obtain the device buffer - confirm.
            svVecIntPtrs[0] = cuda.AllocateHost(memSvSize);
            svVecIntPtrs[1] = cuda.AllocateHost(memSvSize);
            mainVecPtr      = cuda.CopyHostToDeviceAsync(svVecIntPtrs[0], memSvSize, stream);

            // Bind the module texture "svTexRef" to the device buffer.
            cuSVTexRef = cuda.GetModuleTexture(cuModule, "svTexRef");
            cuda.SetTextureFlags(cuSVTexRef, 0);
            cuda.SetTextureAddress(cuSVTexRef, mainVecPtr, memSvSize);

            //todo: copy labels and alphas

            // Both arrays are sized by SupportElements.Length, while the loop below
            // runs over SupportElementsIndexes.Length.
            // NOTE(review): this assumes the two lengths are equal - confirm.
            float[] svLabels = new float[TrainedModel.SupportElements.Length];
            float[] svAlphas = new float[TrainedModel.SupportElements.Length];


            // Each iteration writes only its own slot i of the two arrays.
            Parallel.For(0, TrainedModel.SupportElementsIndexes.Length,
                         i => {
                int idx = TrainedModel.SupportElementsIndexes[i];

                // NOTE(review): the label is read with i but the alpha with idx; the
                // commented-out line below used idx for the label. Presumably
                // TrainedModel.Y is already restricted to support vectors - verify.
                svLabels[i] = TrainedModel.Y[i];
                //svLabels[i] = TrainningProblem.Labels[idx];
                svAlphas[i] = TrainedModel.Alpha[idx];
            });

            //for (int i = 0; i < TrainedModel.SupportElementsIndexes.Length; i++)
            //{
            //    int idx = TrainedModel.SupportElementsIndexes[i];
            //    svLabels[i]= TrainningProblem.Labels[idx];
            //    svAlphas[i] = TrainedModel.Alpha[idx];

            //}

            // Upload labels and alphas to the device.
            labelsPtr = cuda.CopyHostToDevice(svLabels);
            alphasPtr = cuda.CopyHostToDevice(svAlphas);

            IsInitialized = true;
        }
Example #11
0
 /// <summary>
 /// Keeps the supplied CUDA wrapper, loads the built-in test frame module and
 /// creates the two events stored in <c>_cudaStartEvent</c> and <c>_cudaStopEvent</c>.
 /// </summary>
 /// <param name="cuda">CUDA wrapper used for all subsequent calls.</param>
 public CudaTest(CUDA cuda)
 {
     _cuda = cuda;

     // Frame module first: it is loaded before any event handle is created.
     LoadFrameModule(Cuda.CudaModle.BuiltinModules.Test);

     // Event handles, created in start/stop order.
     _cudaStartEvent = _cuda.CreateEvent();
     _cudaStopEvent  = _cuda.CreateEvent();
 }
Example #12
0
        /// <summary>
        /// Allocates a 2D array on the device.
        /// </summary>
        /// <typeparam name="T">Float, Double, ComplexF or ComplexD.</typeparam>
        /// <param name="rows">The x dimension.</param>
        /// <param name="columns">The y dimension.</param>
        /// <returns>
        /// A zero-sized 2D host array that acts as the key for the device allocation
        /// registered in <c>_deviceMemory</c>.
        /// </returns>
        public override T[,] Allocate <T>(int rows, int columns)
        {
            // Host-side placeholder; the actual storage lives on the device.
            T[,] handleArray = new T[0, 0];

            CUdeviceptr devicePointer = new CUdeviceptr();
            int         byteCount     = rows * columns * CUDA.MSizeOf(typeof(T));
            HandleError(CUDARuntime.cudaMalloc(ref devicePointer, byteCount));

            CUDevicePtrEx entry = new CUDevicePtrEx(devicePointer, rows, columns, null);
            _deviceMemory.Add(handleArray, entry);

            return handleArray;
        }
Example #13
0
        /// <summary>
        /// Allocates a vector on the device.
        /// </summary>
        /// <typeparam name="T">Float, Double, ComplexF or ComplexD.</typeparam>
        /// <param name="x">Length of the 1D array.</param>
        /// <returns>
        /// A zero-length host array that acts as the key for the device allocation
        /// registered in <c>_deviceMemory</c>.
        /// </returns>
        public override T[] Allocate <T>(int x)
        {
            // Host-side placeholder; the actual storage lives on the device.
            T[] handleArray = new T[0];

            CUdeviceptr devicePointer = new CUdeviceptr();
            int         byteCount     = x * CUDA.MSizeOf(typeof(T));
            HandleError(CUDARuntime.cudaMalloc(ref devicePointer, byteCount));

            CUDevicePtrEx entry = new CUDevicePtrEx(devicePointer, x, null);
            _deviceMemory.Add(handleArray, entry);

            return handleArray;
        }
Example #14
0
        // Varying the number of blocks and the number of threads is a good way to
        // understand how to keep the hardware busy.
        //
        // Here are some numbers I get on my G80:
        //    blocks - clocks
        //    1 - 3096
        //    8 - 3232
        //    16 - 3364
        //    32 - 4615
        //    64 - 9981
        //
        // With fewer than 16 blocks some of the multiprocessors of the device are idle.
        // With more than 16 you are using all the multiprocessors, but there's only one
        // block per multiprocessor and that doesn't allow you to hide the latency of the
        // memory. With more than 32 the speed scales linearly.
        static void Main(string[] args)
        {
            // Init CUDA, select 1st device.
            CUDA cuda = new CUDA(0, true);

            // Load the module from the working directory and resolve the kernel.
            string modulePath = Path.Combine(Environment.CurrentDirectory, "clock_kernel.cubin");
            cuda.LoadModule(modulePath);
            CUfunction kernel = cuda.GetModuleFunction("timedReduction");

            // Host buffers: two clock samples per block, two inputs per thread.
            int[]   clocks  = new int[NUM_BLOCKS * 2];
            float[] samples = new float[NUM_THREADS * 2];

            for (int k = 0; k < samples.Length; k++)
            {
                samples[k] = (float)k;
            }

            CUdeviceptr dSamples = cuda.CopyHostToDevice <float>(samples);
            CUdeviceptr dResult  = cuda.Allocate((uint)(sizeof(float) * NUM_BLOCKS));
            CUdeviceptr dClocks  = cuda.Allocate <int>(clocks);

            // Three arguments laid out at pointer-sized offsets.
            int slot = IntPtr.Size;
            cuda.SetParameter(kernel, 0, (uint)dSamples.Pointer);
            cuda.SetParameter(kernel, slot, (uint)dResult.Pointer);
            cuda.SetParameter(kernel, slot * 2, (uint)dClocks.Pointer);
            cuda.SetParameterSize(kernel, (uint)(slot * 3));

            //timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);
            cuda.SetFunctionBlockShape(kernel, NUM_THREADS, 1, 1);
            cuda.SetFunctionSharedSize(kernel, (uint)(sizeof(float) * 2 * NUM_THREADS));
            cuda.Launch(kernel, NUM_BLOCKS, 1);

            cuda.CopyDeviceToHost <int>(dClocks, clocks);

            cuda.Free(dSamples);
            cuda.Free(dResult);
            cuda.Free(dClocks);

            for (int k = 0; k < clocks.Length; k++)
            {
                Console.WriteLine(clocks[k]);
            }

            Console.WriteLine("Test PASSED");

            // First half of the buffer is compared for the minimum, second half for
            // the maximum; the difference is reported as the elapsed time.
            int earliestStart = clocks[0];
            int latestEnd     = clocks[NUM_BLOCKS];

            for (int b = 1; b < NUM_BLOCKS; b++)
            {
                earliestStart = Math.Min(earliestStart, clocks[b]);
                latestEnd     = Math.Max(latestEnd, clocks[NUM_BLOCKS + b]);
            }

            Console.WriteLine("time = {0}", latestEnd - earliestStart);
        }
Example #15
0
        // Varying the number of blocks and the number of threads is a good way to
        // understand how to keep the hardware busy.
        //
        // Here are some numbers I get on my G80:
        //    blocks - clocks
        //    1 - 3096
        //    8 - 3232
        //    16 - 3364
        //    32 - 4615
        //    64 - 9981
        //
        // With fewer than 16 blocks some of the multiprocessors of the device are idle.
        // With more than 16 you are using all the multiprocessors, but there's only one
        // block per multiprocessor and that doesn't allow you to hide the latency of the
        // memory. With more than 32 the speed scales linearly.
        static void Main(string[] args)
        {
            // Init CUDA, select 1st device.
            CUDA cuda = new CUDA(0, true);

            // Load the module and look up the kernel entry point.
            string cubinPath = Path.Combine(Environment.CurrentDirectory, "clock_kernel.cubin");
            cuda.LoadModule(cubinPath);
            CUfunction timedReduction = cuda.GetModuleFunction("timedReduction");

            int[] hostTimer = new int[NUM_BLOCKS * 2];
            float[] hostInput = new float[NUM_THREADS * 2];

            int fillIndex = 0;
            while (fillIndex < NUM_THREADS * 2)
            {
                hostInput[fillIndex] = (float)fillIndex;
                fillIndex++;
            }

            CUdeviceptr devInput = cuda.CopyHostToDevice<float>(hostInput);
            CUdeviceptr devOutput = cuda.Allocate((uint)(sizeof(float) * NUM_BLOCKS));
            CUdeviceptr devTimer = cuda.Allocate<int>(hostTimer);

            // Arguments sit at consecutive pointer-sized offsets.
            cuda.SetParameter(timedReduction, 0, (uint)devInput.Pointer);
            cuda.SetParameter(timedReduction, IntPtr.Size, (uint)devOutput.Pointer);
            cuda.SetParameter(timedReduction, IntPtr.Size * 2, (uint)devTimer.Pointer);
            cuda.SetParameterSize(timedReduction, (uint)(IntPtr.Size * 3));

            //timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);
            cuda.SetFunctionBlockShape(timedReduction, NUM_THREADS, 1, 1);
            cuda.SetFunctionSharedSize(timedReduction, (uint)(sizeof(float) * 2 * NUM_THREADS));
            cuda.Launch(timedReduction, NUM_BLOCKS, 1);

            cuda.CopyDeviceToHost<int>(devTimer, hostTimer);

            cuda.Free(devInput);
            cuda.Free(devOutput);
            cuda.Free(devTimer);

            foreach (int sample in hostTimer)
            {
                Console.WriteLine(sample);
            }

            Console.WriteLine("Test PASSED");

            // Minimum over the first NUM_BLOCKS entries, maximum over the next NUM_BLOCKS.
            int minStart = hostTimer[0];
            int maxEnd = hostTimer[NUM_BLOCKS];
            for (int block = 1; block < NUM_BLOCKS; block++)
            {
                if (hostTimer[block] < minStart)
                {
                    minStart = hostTimer[block];
                }

                if (hostTimer[NUM_BLOCKS + block] > maxEnd)
                {
                    maxEnd = hostTimer[NUM_BLOCKS + block];
                }
            }

            Console.WriteLine("time = {0}", maxEnd - minStart);
        }
Example #16
0
 /// <summary>
 /// Builds a <c>CudaFunctionCall</c> for <c>Names.CountSetsFrequencies</c>, appends
 /// the three device pointers followed by width, height and set size, and executes
 /// it with the block and grid sizes from <c>ProjectConstants</c>.
 /// </summary>
 /// <param name="cuda">CUDA wrapper the call is bound to.</param>
 /// <param name="deviceInput">Device pointer passed as the first parameter.</param>
 /// <param name="deviceInputSet">Device pointer passed as the second parameter.</param>
 /// <param name="deviceOutput">Device pointer passed as the third parameter.</param>
 /// <param name="wrapper">Supplies <c>Width</c> and <c>Height</c>, both passed as <c>uint</c>.</param>
 /// <param name="setSize">Passed as <c>uint</c> after the height.</param>
 private static void CallTheFrequencyCount(CUDA cuda, CUdeviceptr deviceInput, CUdeviceptr deviceInputSet, CUdeviceptr deviceOutput, BitmapWrapper wrapper, int setSize)
 {
     var call = new CudaFunctionCall(cuda, Names.CountSetsFrequencies);

     // Device buffers first, in the order the kernel receives them.
     var withInput    = call.AddParameter(deviceInput);
     var withInputSet = withInput.AddParameter(deviceInputSet);
     var withOutput   = withInputSet.AddParameter(deviceOutput);

     // Then the scalar arguments.
     var withWidth   = withOutput.AddParameter((uint)wrapper.Width);
     var withHeight  = withWidth.AddParameter((uint)wrapper.Height);
     var withSetSize = withHeight.AddParameter((uint)setSize);

     withSetSize.Execute(ProjectConstants.BlockSize, ProjectConstants.BlockSize, 1, ProjectConstants.GridSize, ProjectConstants.GridSize);
 }
Example #17
0
        /// <summary>
        /// Initializes the form and fills the list box with the name of every
        /// device reported by CUDA. The first entry is preselected when there is one.
        /// </summary>
        public FormGpuList()
        {
            InitializeComponent();

            mCuda = new CUDA(true);

            // Read the device list once instead of on every loop iteration.
            var devices = mCuda.Devices;
            for (int i = 0; i < devices.Length; i++)
            {
                lbGpuNames.Items.Add(devices[i].Name);
            }

            // Setting SelectedIndex on an empty list box throws
            // ArgumentOutOfRangeException, so only preselect when a device was found.
            if (lbGpuNames.Items.Count > 0)
            {
                lbGpuNames.SelectedIndex = 0;
            }
        }
        /// <summary>
        /// Initializes the form and fills the list box with the name of every
        /// device reported by CUDA. The first entry is preselected when there is one.
        /// </summary>
        public FormGpuList()
        {
            InitializeComponent();

            mCuda = new CUDA(true);

            // Read the device list once instead of on every loop iteration.
            var devices = mCuda.Devices;
            for (int i = 0; i < devices.Length; i++)
            {
                lbGpuNames.Items.Add(devices[i].Name);
            }

            // Setting SelectedIndex on an empty list box throws
            // ArgumentOutOfRangeException, so only preselect when a device was found.
            if (lbGpuNames.Items.Count > 0)
            {
                lbGpuNames.SelectedIndex = 0;
            }
        }
Example #19
0
        /// <summary>
        /// Gets the device properties.
        /// </summary>
        /// <remarks>
        /// The result is produced lazily. For <see cref="eGPUType.Cuda"/> the CUDA context
        /// that was current when enumeration started is made current again once enumeration
        /// ends, including when the caller stops enumerating early or an exception is thrown.
        /// </remarks>
        /// <param name="type">The type of GPU.</param>
        /// <param name="useAdvanced">Whether to get the additional device settings via the cudart dll.</param>
        /// <returns>Device properties for all devices of the specified type.</returns>
        /// <exception cref="CudafyHostException">
        /// A CUDA device could not be obtained, or <paramref name="type"/> is not supported.
        /// </exception>
        public static IEnumerable <GPGPUProperties> GetDeviceProperties(eGPUType type, bool useAdvanced = true)
        {
            if (type == eGPUType.Emulator)
            {
                foreach (var kvp in GPGPUs.Where(g => g.Value is EmulatedGPU))
                {
                    yield return(kvp.Value.GetDeviceProperties(useAdvanced));
                }
            }
            else if (type == eGPUType.Cuda)
            {
                // Store the current context
                CUcontext?ctx        = CUDA.TryGetCurrentContext();
                GPGPU     currentGPU = null;

                try
                {
                    int devCnt = CudaGPU.GetDeviceCount();
                    for (int i = 0; i < devCnt; i++)
                    {
                        CudaGPU gpu = (CudaGPU)GetDevice(eGPUType.Cuda, i);
                        if (gpu == null)
                        {
                            throw new CudafyHostException(CudafyHostException.csDEVICE_X_NOT_FOUND, string.Format("{0}{1}", eGPUType.Cuda.ToString(), i));
                        }

                        GPGPUProperties props = gpu.GetDeviceProperties(useAdvanced);

                        // Remember which GPU owns the context that was current on entry.
                        if (ctx != null && gpu.GetDeviceContext().Pointer == ctx.Value.Pointer)
                        {
                            currentGPU = gpu;
                        }
                        yield return(props);
                    }
                }
                finally
                {
                    // Reset context to current GPU. This runs from finally so the restore also
                    // happens when the enumerator is disposed before the loop completes
                    // (e.g. First() or a break in foreach) or when a device lookup throws.
                    if (ctx != null && currentGPU != null)
                    {
                        currentGPU.SetCurrentContext();
                    }
                }
            }
            else if (type == eGPUType.OpenCL)
            {
                int deviceId = 0;
                foreach (ComputeDevice computeDevice in OpenCLDevice.ComputeDevices)
                {
                    yield return(OpenCLDevice.GetDeviceProperties(computeDevice, deviceId++));
                }
            }
            else
            {
                throw new CudafyHostException(CudafyHostException.csX_NOT_CURRENTLY_SUPPORTED, type);
            }
        }
Example #20
0
 /// <summary>
 /// Destroys the start and stop events and drops the CUDA reference.
 /// Does nothing when the reference has already been cleared.
 /// </summary>
 public void Dispose()
 {
     // Nothing to release when the CUDA reference is already gone.
     if (_cuda == null)
     {
         return;
     }

     //if (_hasAllocatedFrame == true)
     //{
     //    FreeAllocatedFrame();
     //}

     // Destroy both event handles, then clear the reference so a second call is a no-op.
     _cuda.DestroyEvent(_cudaStartEvent);
     _cuda.DestroyEvent(_cudaStopEvent);
     _cuda = null;
 }
Example #21
0
        /// <summary>
        /// Releases every CUDA resource held by this object: frees the device buffers,
        /// destroys the texture references, unloads the module and disposes the CUDA
        /// wrapper. Does nothing when <c>cuda</c> is already null.
        /// </summary>
        private void DisposeCuda()
        {
            if (cuda != null)
            {
                //free all resources
                // Each freed pointer is reset to IntPtr.Zero right after its Free call.

                // value buffers (CSR and CSC)
                cuda.Free(valsCSRPtr);
                cuda.Free(valsCSCPtr);
                valsCSRPtr.Pointer = IntPtr.Zero;
                valsCSCPtr.Pointer = IntPtr.Zero;

                // index buffers (CSR and CSC)
                cuda.Free(idxCSRPtr);
                cuda.Free(idxCSCPtr);
                idxCSRPtr.Pointer = IntPtr.Zero;
                idxCSCPtr.Pointer = IntPtr.Zero;

                // vector-length buffers (CSR and CSC)
                cuda.Free(vecLenghtCSRPtr);
                cuda.Free(vecLenghtCSCPtr);
                vecLenghtCSRPtr.Pointer = IntPtr.Zero;
                vecLenghtCSCPtr.Pointer = IntPtr.Zero;



                cuda.Free(qdPtr);
                qdPtr.Pointer = IntPtr.Zero;
                // NOTE(review): diagPtr is only zeroed here; its Free call is commented
                // out. Presumably it aliases another buffer or is never allocated - confirm.
                //  cuda.Free(diagPtr);
                diagPtr.Pointer = IntPtr.Zero;
                cuda.Free(alphaPtr);
                alphaPtr.Pointer = IntPtr.Zero;
                cuda.Free(gradPtr);
                gradPtr.Pointer = IntPtr.Zero;

                // Buffers with an associated texture: the memory is freed first,
                // then the texture reference is destroyed.
                cuda.Free(deltasPtr);
                deltasPtr.Pointer = IntPtr.Zero;
                cuda.DestroyTexture(cuDeltasTexRef);

                cuda.Free(labelsPtr);
                labelsPtr.Pointer = IntPtr.Zero;
                cuda.DestroyTexture(cuLabelsTexRef);

                cuda.Free(mainVecPtr);
                mainVecPtr.Pointer = IntPtr.Zero;

                cuda.DestroyTexture(cuMainVecTexRef);

                // Finally unload the module, dispose the wrapper and clear the
                // reference so a second call skips the whole block.
                cuda.UnloadModule(cuModule);
                cuda.Dispose();
                cuda = null;
            }
        }
        /// <summary>
        /// Stores the CUDA wrapper, the device pointers and the matrix dimensions, and
        /// derives <c>blocksPerGrid</c> as the ceiling of <c>vecDim / threadsPerBlock</c>.
        /// </summary>
        /// <param name="cu">CUDA wrapper to use.</param>
        /// <param name="vector">Device pointer stored in <c>vecPtr</c>.</param>
        /// <param name="vals">Device pointer stored in <c>valsPtr</c>.</param>
        /// <param name="cols">Device pointer stored in <c>idxPtr</c>.</param>
        /// <param name="length">Device pointer stored in <c>vecLengthPtr</c>.</param>
        /// <param name="rows">Row count, stored as <c>uint</c> in <c>nrRows</c>.</param>
        /// <param name="dim">Vector dimension, stored as <c>uint</c> in <c>vecDim</c>.</param>
        public EllpackDenseVectorBuilder(CUDA cu, CUdeviceptr vector, CUdeviceptr vals, CUdeviceptr cols, CUdeviceptr length, int rows, int dim)
        {
            cuda = cu;

            // Device pointers handed in by the caller.
            vecPtr       = vector;
            valsPtr      = vals;
            idxPtr       = cols;
            vecLengthPtr = length;

            // Dimensions.
            nrRows = (uint)rows;
            vecDim = (uint)dim;

            // Ceiling division computed in floating point...
            double exactBlocks = (vecDim + 0.0) / threadsPerBlock;
            blocksPerGrid = (int)Math.Ceiling(exactBlocks);

            // ...cross-checked in debug builds against the integer formulation.
            var integerCeiling = (vecDim + threadsPerBlock - 1) / threadsPerBlock;
            Debug.Assert(blocksPerGrid == integerCeiling);
        }
Example #23
0
        //
        // A sorting network is a sorting algorithm where the sequence of comparisons
        // is not data-dependent. That makes it suitable for parallel implementations.
        //
        // Bitonic sort is one of the fastest sorting networks, consisting of O(n log^2 n)
        // comparators. It has a simple implementation and it's very efficient when sorting
        // a small number of elements:
        //
        // http://citeseer.ist.psu.edu/blelloch98experimental.html
        //
        // This implementation is based on:
        //
        // http://www.tools-of-computing.com/tc/CS/Sorts/bitonic_sort.htm
        //
        static void Main(string[] args)
        {
            const int NUM = 256;

            // Init CUDA, select 1st device.
            CUDA cuda = new CUDA(0, true);

            // Fill the host array with random values.
            int[]  keys      = new int[NUM];
            Random generator = new Random();

            for (int k = 0; k < keys.Length; k++)
            {
                keys[k] = generator.Next();
            }

            // Allocate device memory and copy the values over.
            CUdeviceptr deviceKeys = cuda.CopyHostToDevice <int>(keys);

            // Load the module and resolve the kernel.
            string modulePath = Path.Combine(Environment.CurrentDirectory, "bitonic.cubin");
            cuda.LoadModule(modulePath);
            CUfunction sortKernel = cuda.GetModuleFunction("bitonicSort");

            // Single pointer-sized argument.
            cuda.SetParameter(sortKernel, 0, (uint)deviceKeys.Pointer);
            cuda.SetParameterSize(sortKernel, (uint)IntPtr.Size);

            //bitonicSort<<<1, NUM, sizeof(int) * NUM>>>(dvalues);
            cuda.SetFunctionBlockShape(sortKernel, NUM, 1, 1);
            cuda.SetFunctionSharedSize(sortKernel, sizeof(int) * NUM);
            cuda.Launch(sortKernel, 1, 1);

            cuda.CopyDeviceToHost <int>(deviceKeys, keys);
            cuda.Free(deviceKeys);

            // Walk forward while neighbours are in non-decreasing order; the test
            // passes only when the walk reaches the end of the array.
            int position = 1;
            while (position < NUM && values_InOrder(keys, position))
            {
                position++;
            }

            bool passed = position >= NUM;

            Console.WriteLine("Test {0}", passed ? "PASSED" : "FAILED");
        }

        // True when the element before `index` is not greater than the element at `index`.
        private static bool values_InOrder(int[] data, int index)
        {
            return !(data[index - 1] > data[index]);
        }
        /// <summary>
        /// Stores the CUDA wrapper, the device pointers and the matrix dimensions, and
        /// derives <c>blocksPerGrid</c> as the ceiling of <c>vecDim / threadsPerBlock</c>.
        /// </summary>
        /// <param name="cu">CUDA wrapper to use.</param>
        /// <param name="vector">Device pointer stored in <c>vecPtr</c>.</param>
        /// <param name="vals">Device pointer stored in <c>valsPtr</c>.</param>
        /// <param name="cols">Device pointer stored in <c>idxPtr</c>.</param>
        /// <param name="length">Device pointer stored in <c>vecLengthPtr</c>.</param>
        /// <param name="rows">Row count, stored as <c>uint</c> in <c>nrRows</c>.</param>
        /// <param name="dim">Vector dimension, stored as <c>uint</c> in <c>vecDim</c>.</param>
        public EllpackDenseVectorBuilder(CUDA cu,CUdeviceptr vector, CUdeviceptr vals,CUdeviceptr cols,CUdeviceptr length,int rows,int dim)
        {
            cuda = cu;

            // Device pointers handed in by the caller.
            vecPtr       = vector;
            valsPtr      = vals;
            idxPtr       = cols;
            vecLengthPtr = length;

            // Dimensions.
            nrRows = (uint)rows;
            vecDim = (uint)dim;

            // Ceiling division computed in floating point...
            double fractionalBlocks = (vecDim + 0.0) / threadsPerBlock;
            blocksPerGrid = (int)Math.Ceiling(fractionalBlocks);

            // ...cross-checked in debug builds against the integer formulation.
            var roundedUpBlocks = (vecDim + threadsPerBlock - 1) / threadsPerBlock;
            Debug.Assert(blocksPerGrid == roundedUpBlocks);
        }
Example #25
0
        /// <summary>
        /// Creates the CUDA wrapper, makes a new MapHost context current, loads the
        /// module named by <c>cudaModuleName</c> from the current directory and
        /// resolves the evaluator and reduce kernels.
        /// </summary>
        private void InitCuda()
        {
            cuda = new CUDA(0, true);

            // Create the context and make it current in one step.
            cuda.SetCurrentContext(cuda.CreateContext(0, CUCtxFlags.MapHost));

            string modulePath = Path.Combine(Environment.CurrentDirectory, cudaModuleName);
            cuModule = cuda.LoadModule(modulePath);

            // Kernel entry points.
            cuFuncEval   = cuda.GetModuleFunction(cudaEvaluatorKernelName);
            cuFuncReduce = cuda.GetModuleFunction(cudaReduceKernelName);
        }
Example #26
0
        /// <summary>
        /// Copies elements from a device array into a host array with <c>cudaMemcpy</c>.
        /// </summary>
        /// <typeparam name="T">Element type; its size is taken from <c>CUDA.MSizeOf</c>.</typeparam>
        /// <param name="devArray">Host-side key of the device allocation (looked up via <c>GetDeviceMemory</c>).</param>
        /// <param name="devOffset">Offset, in elements, into the device allocation.</param>
        /// <param name="hostArray">Destination host array. It is pinned for the duration of the copy.</param>
        /// <param name="hostOffset">Offset, in elements, into the host array.</param>
        /// <param name="count">
        /// Number of elements to copy. A value of zero or less copies everything from
        /// <paramref name="hostOffset"/> to the end of <paramref name="hostArray"/>.
        /// </param>
        private void CopyFromDevice <T>(Array devArray, int devOffset, Array hostArray, int hostOffset, int count)
        {
            CUDevicePtrEx devPtrEx = GetDeviceMemory(devArray) as CUDevicePtrEx;
            int           elemSize = CUDA.MSizeOf(typeof(T));

            // Honor the requested element count. The previous code ignored `count` and
            // always copied hostArray.Length elements, which writes past the end of the
            // host buffer whenever hostOffset is non-zero.
            int n = count > 0 ? count : hostArray.Length - hostOffset;

            GCHandle  handle = GCHandle.Alloc(hostArray, GCHandleType.Pinned);
            cudaError rc;

            try
            {
                // 64-bit arithmetic so a large offset cannot overflow before the addition.
                IntPtr      hostPtr = new IntPtr(handle.AddrOfPinnedObject().ToInt64() + (long)hostOffset * elemSize);
                CUdeviceptr devPtr  = devPtrEx.DevPtr + devOffset * elemSize;
                rc = CUDARuntime.cudaMemcpy(hostPtr, devPtr, elemSize * n, cudaMemcpyKind.cudaMemcpyDeviceToHost);
            }
            finally
            {
                // Unpin even when the pointer arithmetic or the native call throws.
                handle.Free();
            }

            HandleError(rc);
        }
Example #27
0
        //
        // A sorting network is a sorting algorithm where the sequence of comparisons
        // is not data-dependent. That makes it suitable for parallel implementations.
        //
        // Bitonic sort is one of the fastest sorting networks, consisting of O(n log^2 n)
        // comparators. It has a simple implementation and it's very efficient when sorting
        // a small number of elements:
        //
        // http://citeseer.ist.psu.edu/blelloch98experimental.html
        //
        // This implementation is based on:
        //
        // http://www.tools-of-computing.com/tc/CS/Sorts/bitonic_sort.htm
        //
        static void Main(string[] args)
        {
            const int NUM = 256;

            // Init CUDA, select 1st device.
            CUDA cuda = new CUDA(0, true);

            // Fill the host array with random values.
            int[] hostValues = new int[NUM];
            Random rng = new Random();
            int fill = 0;
            while (fill < NUM)
            {
                hostValues[fill] = rng.Next();
                fill++;
            }

            // Allocate device memory and copy the values over.
            CUdeviceptr deviceValues = cuda.CopyHostToDevice<int>(hostValues);

            // Load the module and resolve the kernel.
            string cubinPath = Path.Combine(Environment.CurrentDirectory, "bitonic.cubin");
            cuda.LoadModule(cubinPath);
            CUfunction bitonicSort = cuda.GetModuleFunction("bitonicSort");

            // Single pointer-sized argument.
            cuda.SetParameter(bitonicSort, 0, (uint)deviceValues.Pointer);
            cuda.SetParameterSize(bitonicSort, (uint)IntPtr.Size);

            //bitonicSort<<<1, NUM, sizeof(int) * NUM>>>(dvalues);
            cuda.SetFunctionBlockShape(bitonicSort, NUM, 1, 1);
            cuda.SetFunctionSharedSize(bitonicSort, sizeof(int) * NUM);
            cuda.Launch(bitonicSort, 1, 1);

            cuda.CopyDeviceToHost<int>(deviceValues, hostValues);
            cuda.Free(deviceValues);

            // Look for the first adjacent pair that is out of order; none means success.
            int firstDisorder = -1;
            for (int i = 1; i < NUM && firstDisorder < 0; i++)
            {
                if (hostValues[i - 1] > hostValues[i])
                {
                    firstDisorder = i;
                }
            }

            bool passed = firstDisorder < 0;

            Console.WriteLine("Test {0}", passed ? "PASSED" : "FAILED");
        }
Example #28
0
        /// <summary>
        /// Creates the CUDA wrapper and a MapHost context for device 0, verifies that the
        /// module file exists in the current directory, then loads it and resolves the
        /// product kernel.
        /// </summary>
        /// <exception cref="ArgumentException">The module file does not exist.</exception>
        protected void InitCudaModule()
        {
            int deviceIndex = 0;

            cuda  = new CUDA(deviceIndex, true);
            cuCtx = cuda.CreateContext(deviceIndex, CUCtxFlags.MapHost);

            string cubinPath = Path.Combine(Environment.CurrentDirectory, cudaModuleName);

            // Fail early with the path in the message when the module is missing.
            bool moduleMissing = !File.Exists(cubinPath);
            if (moduleMissing)
            {
                throw new ArgumentException("Failed access to cuda module" + cubinPath);
            }

            cuModule = cuda.LoadModule(cubinPath);
            cuFunc   = cuda.GetModuleFunction(cudaProductKernelName);
        }
Example #29
0
 /// <summary>
 /// Creates the shared CUDA instance (first device, runtime exceptions enabled)
 /// and the <c>CudaTest</c> helper that uses it.
 /// </summary>
 /// <remarks>
 /// If constructing <c>CudaTest</c> fails, the CUDA instance is disposed and the
 /// reference cleared before the exception propagates, so no half-initialized
 /// state is left behind.
 /// </remarks>
 public static void CudaStart()
 {
     //+ Create a new instance of CUDA class, select 1st device. (create on worker thread)
     _cuda = new CUDA(0, true)
     {
         UseRuntimeExceptions = true,
     };

     // A constructor call never yields null, so the former "CUDA not initalized"
     // branch was unreachable and has been removed.
     Console.WriteLine("-- initializing CUDA");

     try
     {
         _cudaTest = new CudaTest(_cuda);
     }
     catch
     {
         // Do not leak the CUDA instance when the helper cannot be created.
         _cuda.Dispose();
         _cuda = null;
         throw;
     }
 }
        /// <summary>
        /// Selects the CUDA target, obtains the device and its underlying CUDA wrapper,
        /// makes a new MapHost context current, and loads the Cudafy module. The module
        /// is deserialized from <c>moduleName</c> when possible and otherwise translated
        /// from <c>CudafyRBFSlicedEllpackKernel</c> and serialized.
        /// </summary>
        private void InitCudaModule()
        {
            cufy.CudafyModes.Target = cufy.eGPUType.Cuda;

            gpu   = CudafyHost.GetDevice(CudafyModes.Target);
            cuGPU = (CUDA)((CudaGPU)gpu).CudaDotNet;

            // Create the context and make it current in one step.
            cuGPU.SetCurrentContext(cuGPU.CreateContext(0, CUCtxFlags.MapHost));

            // gpu.EnableSmartCopy();

            // Reuse the serialized module only when it exists and its checksums verify.
            module = CudafyModule.TryDeserialize(moduleName);
            bool cachedModuleUsable = module != null && module.TryVerifyChecksums();
            if (!cachedModuleUsable)
            {
                module = CudafyTranslator.Cudafy(typeof(CudafyRBFSlicedEllpackKernel));
                module.Serialize();
            }

            gpu.LoadModule(module);
        }
Example #31
0
        /// <summary>
        /// Reuses the CUDA wrapper of <c>gpuKernel</c>, verifies that the module file
        /// exists in the current directory, then loads it and resolves the two kernels.
        /// </summary>
        /// <exception cref="ArgumentException">The module file does not exist.</exception>
        private void InitCudaModule()
        {
            // The wrapper is shared with the kernel rather than created here.
            cuda = gpuKernel.cuda;

            //cuda = new CUDA(0, true);
            //cuCtx = cuda.CreateContext(0, CUCtxFlags.MapHost);
            //cuda.SetCurrentContext(cuCtx);

            string cubinPath = Path.Combine(Environment.CurrentDirectory, cudaModuleName);

            // Fail early with the path in the message when the module is missing.
            if (File.Exists(cubinPath) == false)
            {
                throw new ArgumentException("Failed access to cuda module" + cubinPath);
            }

            cuModule = cuda.LoadModule(cubinPath);

            // Kernel entry points.
            cuFuncFindMaxIMinJ = cuda.GetModuleFunction(funcFindMaxIMinJ);
            cuFuncUpdateG      = cuda.GetModuleFunction(funcUpdateGFunc);
        }
Example #32
0
        /// <summary>
        /// Creates the CUDA wrapper, loads the module named by <c>cudaModuleName</c>
        /// from the current directory and resolves all kernel handles used by this class.
        /// </summary>
        private void InitCudaModule()
        {
            cuda = new CUDA(0, true);

            string modulePath = Path.Combine(Environment.CurrentDirectory, cudaModuleName);
            cuModule = cuda.LoadModule(modulePath);

            // Kernel entry points, resolved in the same order as before.
            cuFuncDotProd        = cuda.GetModuleFunction(cudaProductKernelName);
            cuFuncGradFinalize   = cuda.GetModuleFunction(cudaGradFinalizeName);
            cuFuncComputeBBstep  = cuda.GetModuleFunction(cudaComputeBBStepName);
            cuFuncObjSquareW     = cuda.GetModuleFunction(cudaObjWName);
            cuFuncObjSquareAlpha = cuda.GetModuleFunction(cudaObjAlphaName);
            cuFuncUpdateW        = cuda.GetModuleFunction(cudaUpdateW);
            cuFuncUpdateAlpha    = cuda.GetModuleFunction(cudaUpdateAlphaName);
            cuFuncLinPart        = cuda.GetModuleFunction(cudaLinPartName);
        }
Example #33
0
        /// <summary>
        /// Loads the module named by <c>Names.CudaAprioriCountModule</c> from the directory
        /// of the executing assembly, runs the frequency count on the bitmap's
        /// <c>RgbValues</c> and returns the counters copied back from the device.
        /// </summary>
        /// <param name="bitmapWrapper">Source of the input data; its <c>Width</c> sizes the result.</param>
        /// <returns>An array with <c>bitmapWrapper.Width</c> counters.</returns>
        private int[] CalculateElementsFrequencies(BitmapWrapper bitmapWrapper)
        {
            var hostCounts = new int[bitmapWrapper.Width];

            using (var cuda = new CUDA(0, true))
            {
                string assemblyDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);
                string modulePath        = Path.Combine(assemblyDirectory, Names.CudaAprioriCountModule);

                cuda.LoadModule(modulePath);

                // Upload the input and reserve the device-side output buffer.
                var deviceInput  = cuda.CopyHostToDevice(bitmapWrapper.RgbValues);
                var deviceCounts = cuda.Allocate(new int[bitmapWrapper.Width]);

                CallTheFrequencyCount(cuda, deviceInput, deviceCounts, bitmapWrapper);
                cuda.CopyDeviceToHost(deviceCounts, hostCounts);

                // Release device buffers, then the module.
                cuda.Free(deviceInput);
                cuda.Free(deviceCounts);
                cuda.UnloadModule();
            }

            return hostCounts;
        }
Example #34
0
        /// <summary>
        /// Flattens the candidate sets into one array, uploads it together with the
        /// bitmap's <c>RgbValues</c>, runs the set-frequency count on the GPU and returns
        /// one counter per candidate.
        /// </summary>
        /// <typeparam name="T">Not used by the body; kept for signature compatibility.</typeparam>
        /// <param name="candidates">
        /// Candidate sets. The flattened buffer is sized by the length of the first
        /// candidate, so all candidates are assumed to have that same length.
        /// </param>
        /// <param name="bitmapWrapper">Source of the input data uploaded to the device.</param>
        /// <param name="borderValue">Forwarded unchanged to <c>CallTheSetsFrequenciesCount</c>.</param>
        /// <returns>
        /// An array with <c>candidates.Count</c> counters; an empty array when there are
        /// no candidates.
        /// </returns>
        private static int[] CalculateCandidatesFrequencies <T>(IList <List <int> > candidates, BitmapWrapper bitmapWrapper, double borderValue)
        {
            // Nothing to count. This also avoids indexing candidates[0] on an empty list
            // and creating a CUDA context for no work.
            if (candidates.Count == 0)
            {
                return new int[0];
            }

            using (var cuda = new CUDA(0, true))
            {
                int candidateCount = candidates.Count;
                int candidateSize  = candidates[0].Count;

                var elementsFrequencies = new int[candidateCount];

                // Flatten the candidates back to back into a single host array.
                var inputSets = new int[candidateCount * candidateSize];
                var ind       = 0;
                foreach (var candidate in candidates)
                {
                    foreach (var i in candidate)
                    {
                        inputSets[ind] = i;
                        ind++;
                    }
                }

                var path = Path.Combine(Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location),
                                        Names.CudaAprioriCountModule);
                cuda.LoadModule(path);

                var inputData           = cuda.CopyHostToDevice(bitmapWrapper.RgbValues);
                var inputSetData        = cuda.CopyHostToDevice(inputSets);
                var frequenciesOnDevice = cuda.Allocate(new int[candidateCount]);

                CallTheSetsFrequenciesCount(cuda, inputData, inputSetData, frequenciesOnDevice, bitmapWrapper, candidateSize, candidateCount, borderValue);

                cuda.CopyDeviceToHost(frequenciesOnDevice, elementsFrequencies);

                cuda.Free(inputData);
                cuda.Free(inputSetData);
                cuda.Free(frequenciesOnDevice);
                cuda.UnloadModule();

                return(elementsFrequencies);
            }
        }
Example #35
0
        /// <summary>
        /// Demo: adds two float vectors of 50000 elements with the "VecAdd" kernel from
        /// structKernel.cubin, prints the measured kernel time and the first ten results.
        /// Events, device buffers and the CUDA instance are released before returning.
        /// </summary>
        private static void CuAddVec()
        {
            int N = 50000;

            // The using block disposes the CUDA instance even when a call below throws.
            using (CUDA cuda = new CUDA(0, true))
            {
                // load module
                cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));
                CUfunction vecAddFunc = cuda.GetModuleFunction("VecAdd");

                float[] A = new float[N];
                float[] B = new float[N];
                float[] C = new float[N];
                for (int i = 0; i < A.Length; i++)
                {
                    A[i] = (float)i;
                    B[i] = (float)i + 0.1f;
                }

                CUdeviceptr dA = cuda.CopyHostToDevice(A);
                CUdeviceptr dB = cuda.CopyHostToDevice(B);

                // A is only used here to size the output buffer.
                CUdeviceptr dC = cuda.Allocate(A);

                int threadsPerBlock = 256;
                // Round up so the last, partially filled block is still launched.
                int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

                cuda.SetFunctionBlockShape(vecAddFunc, threadsPerBlock, 1, 1);

                // Kernel parameters are laid out back to back: three pointers, then the element count.
                int offset = 0;
                cuda.SetParameter(vecAddFunc, offset, (uint)dA.Pointer);
                offset += IntPtr.Size;
                cuda.SetParameter(vecAddFunc, offset, (uint)dB.Pointer);
                offset += IntPtr.Size;
                cuda.SetParameter(vecAddFunc, offset, (uint)dC.Pointer);
                offset += IntPtr.Size;
                cuda.SetParameter(vecAddFunc, offset, (uint)N);
                offset += sizeof(int);
                cuda.SetParameterSize(vecAddFunc, (uint)offset);

                CUevent start = cuda.CreateEvent();
                CUevent end = cuda.CreateEvent();

                cuda.RecordEvent(start);
                cuda.Launch(vecAddFunc, blocksPerGrid, 1);
                cuda.RecordEvent(end);

                // Wait for the kernel and both events before reading the elapsed time.
                cuda.SynchronizeContext();

                float naiveTime = cuda.ElapsedTime(start, end);
                // WriteLine (not Write) so the first result line is not glued to this message.
                Console.WriteLine("adding takes {0}ms", naiveTime);

                cuda.CopyDeviceToHost(dC, C);

                for (int i = 0; i < 10; i++)
                {
                    Console.WriteLine("{0}-{1}", i, C[i]);
                }

                // release resources
                cuda.DestroyEvent(start);
                cuda.DestroyEvent(end);
                cuda.Free(dA);
                cuda.Free(dB);
                cuda.Free(dC);
            }
        }
Example #36
0
        /// <summary>
        /// Async API demo: uploads an int array, runs "increment_kernel" from asyncAPI.ptx and
        /// downloads the result, all issued asynchronously on one stream, then reports the
        /// GPU time and verifies the output.
        /// </summary>
        /// <param name="args">Unused.</param>
        static void Main(string[] args)
        {
            // Create a new instance of CUDA class, select 1st device.
            CUDA cuda = new CUDA(0, true);

            // Prepare parameters.
            int n = 16 * 1024 * 1024;
            uint nbytes = (uint)(n * sizeof(int));
            int value = 26;

            // allocate host memory
            int[] a = new int[n];

            // allocate device memory
            CUdeviceptr d_a = cuda.Allocate<int>(a);
            CUDADriver.cuMemsetD8(d_a, 0xff, nbytes);

            // load module
            cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "asyncAPI.ptx"));
            CUfunction func = cuda.GetModuleFunction("increment_kernel");

            // set kernel launch configuration
            cuda.SetFunctionBlockShape(func, 512, 1, 1);

            // create cuda event handles
            CUevent start = cuda.CreateEvent();
            CUevent stop = cuda.CreateEvent();

            // asynchronously issue work to the GPU (all to stream 0)
            CUstream stream = new CUstream();
            cuda.RecordEvent(start);
            cuda.CopyHostToDeviceAsync<int>(d_a, a, stream);

            // set parameters for kernel function
            cuda.SetParameter(func, 0, (uint)d_a.Pointer);
            cuda.SetParameter(func, IntPtr.Size, (uint)value);

            cuda.SetParameterSize(func, (uint)(IntPtr.Size + 4));

            // actually launch kernel
            cuda.LaunchAsync(func, n / 512, 1, stream);

            // queue the copy back of the data behind the kernel
            cuda.CopyDeviceToHostAsync<int>(d_a, a, stream);

            cuda.RecordEvent(stop);

            // Everything above was only queued. Block until the GPU has finished so that
            // the stop event has been reached and the host array actually holds the results
            // before either of them is read.
            cuda.SynchronizeContext();

            // print the gpu time
            Console.WriteLine("time spent executing by the GPU: {0} ms", cuda.ElapsedTime(start, stop));

            // check the output for correctness
            if (CorrectOutput(a, value))
                Console.WriteLine("Test PASSED");
            else
                Console.WriteLine("Test FAILED");

            // release resources
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(stop);
            cuda.Free(d_a);
        }
Example #37
0
        /// <summary>
        /// Asks the remote "kernel" process for a new peer, then runs 300 frames of an N-body
        /// simulation on the GPU and streams the body positions to that peer as script text.
        /// </summary>
        /// <param name="args">Unused.</param>
        public static void Main(string[] args)
        {
            OtpNode node = new OtpNode("gen");
            OtpMbox mbox = node.createMbox(true);
            OtpErlangObject message = new OtpErlangTuple(new OtpErlangObject[] { mbox.Self, new OtpErlangAtom("new") });

            mbox.send("kernel", "pc@3di0050d", message);
            OtpErlangTuple reply = (OtpErlangTuple)mbox.receive();

            OtpErlangPid self = (OtpErlangPid)reply.elementAt(0);
            OtpErlangAtom ok = (OtpErlangAtom)reply.elementAt(1);
            OtpErlangPid pid = (OtpErlangPid)reply.elementAt(2);

            Console.WriteLine("New: {0}", ok);
            if (ok.ToString() != "ok")
            {
                return;
            }

            mbox.link(pid);

            using (CUDA cuda = new CUDA(0, true))
            {
                float deltaTime = 0.1f;
                int nextTickCount;

                using (NBody nbody = new NBody(cuda, deltaTime, 1.0f, 32))
                {
                    // One line is appended per body per frame; a StringBuilder keeps that linear
                    // instead of re-copying the whole script on every "+=".
                    System.Text.StringBuilder script = new System.Text.StringBuilder();

                    nbody.Initialize();

                    script.Append(String.Format("<128,128,50> translate\n"));
                    script.Append(String.Format("/C {{moveto createsphere dup <1,1,1> setsize dup show }} def\n"));

                    for (int i = 0; i < nbody.HostOldPos.Length; i++)
                    {
                        Float4 pos = nbody.HostOldPos[i];
                        script.AppendFormat("<{0},{1},{2}> C /b{3} exch def\n", pos.x, pos.y, pos.z, i);
                    }

                    Load(mbox, pid, script.ToString());
                    script.Length = 0;

                    nextTickCount = System.Environment.TickCount;
                    for (ulong frame = 0; frame < 300; frame++)
                    {
                        // Spin until the next frame is due, then schedule the following one.
                        while (System.Environment.TickCount < nextTickCount);
                        nextTickCount = nextTickCount + (int)(deltaTime * 1000);

                        nbody.Update(0);
                        nbody.Swap();

                        for (int i = 0; i < nbody.HostOldPos.Length; i++)
                        {
                            Float4 pos = nbody.HostOldPos[i];
                            script.AppendFormat("b{3} <{0},{1},{2}> setposition \n", pos.x, pos.y, pos.z, i);
                        }

                        Load(mbox, pid, script.ToString());
                        script.Length = 0;
                    }
                }
            }

            Console.WriteLine("Hit return key to continue");
            Console.ReadLine();

            mbox.send(pid, new OtpErlangTuple(new OtpErlangObject[] { mbox.Self, new OtpErlangAtom("exit") }));
            reply = (OtpErlangTuple)mbox.receive();

            mbox.close();
            node.close();
        }
Example #38
0
        /// <summary>
        /// Builds N sparse vectors in a column-major (ELLPACK-style) layout, binds the vector at
        /// <c>mainIndex</c> to the texture "texRef" and runs the "DotProdEllPackCached" kernel
        /// from structKernel.cubin, printing timings and the first results.
        /// </summary>
        /// <returns>The N kernel outputs copied back from the device.</returns>
        private static float[] CuDotProdEllPackTexCached()
        {
            CUDA cuda = new CUDA(0, true);

            // load module
            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));

            CUfunction structPassFunc = cuda.GetModuleFunction("DotProdEllPackCached");

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("init arrays");
            Stopwatch t = Stopwatch.StartNew();
            float[] vecVals = new float[N * maxRowSize];
            int[] vecIdx = new int[N * maxRowSize];

            maxIndex = 0;
            for (int i = 0; i < N; i++)
            {
                int vecSize = avgElements + i % stdElements;

                float[] vals = Helpers.InitValues(i, vecSize, maxVal);

                // Column-major layout: element z of vector i is stored at z * N + i.
                for (int z = 0; z < vals.Length; z++)
                {
                    int m = z * N + i;
                    vecVals[m] = vals[z];
                }

                int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);
                for (int z = 0; z < index.Length; z++)
                {
                    int m = z * N + i;
                    vecIdx[m] = index[z];
                }
            }

            // Dense copy of the vector at mainIndex; this is what gets bound to the texture.
            float[] mainVec = new float[maxIndex + 1];

            for (int j = 0; j < maxRowSize; j++)
            {
                int idx = vecIdx[mainIndex + N * j];
                float val = vecVals[mainIndex + N * j];
                mainVec[idx] = val;
            }
            Console.WriteLine("Init takes {0}", t.Elapsed);

            // Start() on a running stopwatch does nothing, so the copy time below used to
            // include the init time as well. Reset first so it is measured on its own.
            t.Reset();
            t.Start();

            CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals);
            CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx);

            CUarray cuArr = cuda.CreateArray(mainVec);
            cuda.CopyHostToArray(cuArr, mainVec, 0);

            CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef");
            cuda.SetTextureFlags(cuTexRef, 0);

            cuda.SetTextureArray(cuTexRef, cuArr);

            float[] output = new float[N];
            CUdeviceptr dOutput = cuda.Allocate(output);

            Console.WriteLine("copy to device takes {0}", t.Elapsed);

            cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1);

            // Kernel parameters are laid out back to back: three pointers, then two ints.
            int offset = 0;
            cuda.SetParameter(structPassFunc, offset, valsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(structPassFunc, offset, idxPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(structPassFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(structPassFunc, offset, (uint)maxRowSize);
            offset += sizeof(int);
            cuda.SetParameter(structPassFunc, offset, (uint)N);
            offset += sizeof(int);
            cuda.SetParameterSize(structPassFunc, (uint)offset);

            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);
            cuda.Launch(structPassFunc, blocksPerGrid, 1);

            cuda.RecordEvent(end);

            // Wait for the kernel and both events before reading the elapsed time.
            cuda.SynchronizeContext();
            timer.Stop();
            float naiveTime = cuda.ElapsedTime(start, end);

            Console.Write("EllPack Cached Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed);

            cuda.CopyDeviceToHost(dOutput, output);

            int length = Math.Min(displayCount, N);
            Console.WriteLine();
            for (int i = 0; i < length; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            // release resources (the events were previously never destroyed)
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);
            cuda.Free(valsPtr);
            cuda.Free(idxPtr);
            cuda.Free(dOutput);
            cuda.DestroyArray(cuArr);
            cuda.DestroyTexture(cuTexRef);
            return output;
        }
Example #39
0
        //private double ComputeObj(float[] w, float[] alpha, Problem<SparseVec> sub_prob, float[] diag)
        //{
        //    double v = 0, v1=0;
        //    int nSV = 0;
        //    for (int i = 0; i < w.Length; i++)
        //    {
        //        v += w[i] * w[i];
        //        v1 += 0.5*w[i] * w[i];
        //    }
        //    for (int i = 0; i < alpha.Length; i++)
        //    {
        //        sbyte y_i = (sbyte)sub_prob.Y[i];

        //        //original line
        //        //v += alpha[i] * (alpha[i] * diag[GETI(y_i, i)] - 2);
        //        v += alpha[i] * (alpha[i] * diag[y_i + 1] - 2);
        //        v1 += 0.5* alpha[i] * (alpha[i] * diag[y_i + 1] - 2);
        //        if (alpha[i] > 0) ++nSV;
        //    }

        //    v = v / 2;
        //  //  Debug.WriteLine("Objective value = {0}", v);
        //  //  Debug.WriteLine("nSV = {0}", nSV);

        //    return v;
        //}



        /// <summary>
        /// Creates the CUDA instance (first device), loads the kernel module from the current
        /// working directory and resolves the dot-product, solver and update-W kernel handles.
        /// </summary>
        protected void InitCudaModule()
        {
            cuda = new CUDA(0, true);

            string modulePath = Path.Combine(Environment.CurrentDirectory, cudaModuleName);
            cuModule = cuda.LoadModule(modulePath);

            cuFuncDotProd = cuda.GetModuleFunction(cudaProductKernelName);
            cuFuncSolver  = cuda.GetModuleFunction(cudaSolveL2SVM);
            cuFuncUpdateW = cuda.GetModuleFunction(cudaUpdateW);
        }
Example #40
0
        /// <summary>
        /// Demo: builds four <c>SparseVecPtr</c> structs whose value/index fields hold device
        /// addresses, uploads the struct array and runs the "StructPass" kernel from
        /// structKernel.cubin, printing the kernel time and the outputs.
        /// The pinned handle, device buffers, events and CUDA instance are released before returning.
        /// </summary>
        private static unsafe void CuStructPass()
        {
            int N = 4;

            uint size = (uint)(N * sizeof(SparseVecPtr));

            // The using block disposes the CUDA instance even when a call below throws.
            using (CUDA cuda = new CUDA(0, true))
            {
                // load module
                cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));
                CUfunction structPassFunc = cuda.GetModuleFunction("StructPass");

                SparseVecPtr[] vectors = new SparseVecPtr[N];

                // Remember every per-vector device buffer so it can be freed at the end.
                CUdeviceptr[] valuePtrs = new CUdeviceptr[N];
                CUdeviceptr[] indexPtrs = new CUdeviceptr[N];

                for (int i = 0; i < N; i++)
                {
                    vectors[i] = new SparseVecPtr();
                    vectors[i].size = 2;
                    // NOTE(review): % binds tighter than +, so these evaluate to i + 1 and i + 2;
                    // confirm whether (i + 1) % 5 and (i + 2) % 7 were intended.
                    float[] vals = new float[2] { (float)i + 1 % 5, (float)i + 2 % 7 };

                    int[] index = new int[2] { i % 5, i % 7 };

                    valuePtrs[i] = cuda.CopyHostToDevice(vals);
                    indexPtrs[i] = cuda.CopyHostToDevice(index);

                    vectors[i].indices = new IntPtr(indexPtrs[i].Pointer);
                    vectors[i].values = (IntPtr)valuePtrs[i].Pointer;
                }

                float[] output = new float[N];

                // Pin the struct array only for the duration of the upload; the handle was
                // previously never freed, which kept the array pinned indefinitely.
                CUdeviceptr dVectors;
                GCHandle handle = GCHandle.Alloc(vectors, GCHandleType.Pinned);
                try
                {
                    dVectors = cuda.CopyHostToDevice(handle.AddrOfPinnedObject(), size);
                }
                finally
                {
                    handle.Free();
                }

                CUdeviceptr dOutput = cuda.Allocate(output);

                int threadsPerBlock = 256;
                // Round up so the last, partially filled block is still launched.
                int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

                cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1);

                // Kernel parameters are laid out back to back: two pointers, then the element count.
                int offset = 0;
                cuda.SetParameter(structPassFunc, offset, (uint)dVectors.Pointer);
                offset += IntPtr.Size;
                cuda.SetParameter(structPassFunc, offset, (uint)dOutput.Pointer);
                offset += IntPtr.Size;
                cuda.SetParameter(structPassFunc, offset, (uint)N);
                offset += sizeof(int);
                cuda.SetParameterSize(structPassFunc, (uint)offset);

                CUevent start = cuda.CreateEvent();
                CUevent end = cuda.CreateEvent();

                cuda.RecordEvent(start);
                cuda.Launch(structPassFunc, blocksPerGrid, 1);
                cuda.RecordEvent(end);

                // Wait for the kernel and both events before reading the elapsed time.
                cuda.SynchronizeContext();

                float naiveTime = cuda.ElapsedTime(start, end);
                Console.Write("passing struct takes {0}ms", naiveTime);

                cuda.CopyDeviceToHost(dOutput, output);

                int length = Math.Min(10, N);
                Console.WriteLine();
                for (int i = 0; i < length; i++)
                {
                    Console.WriteLine("{0}-{1}", i, output[i]);
                }

                // release resources
                cuda.DestroyEvent(start);
                cuda.DestroyEvent(end);
                cuda.Free(dVectors);
                cuda.Free(dOutput);
                for (int i = 0; i < N; i++)
                {
                    cuda.Free(valuePtrs[i]);
                    cuda.Free(indexPtrs[i]);
                }
            }
        }
Example #41
0
		/// <summary>
		/// Worker loop that owns the CUDA instance: it creates a context, loads the kernel module,
		/// uploads the alpha-blend lookup table and then processes queued commands
		/// (Allocate, CopyIn, CopyOut, Merge, Dispose) in an endless loop.
		/// Failures of individual commands are logged and stored in the command's PixelsMap;
		/// any exception that escapes the loop is logged and ends the worker.
		/// </summary>
		/// <param name="cState">Not used by this method.</param>
		static private void Worker(object cState)
		{
			try
			{
				Command cCmd;
				CUDA cCUDA = new CUDA(true);
				// Try CreateContext with arguments 0..9 and stop at the first one that succeeds.
				for (int i = 0; i < 10; i++)
				{
					try
					{
						cCUDA.CreateContext(i);
						(new Logger()).WriteDebug2(i + ": success");
						break;
					}
					catch (Exception ex)
					{
                        (new Logger()).WriteDebug2(i + ": failed");
                        if (Logger.bDebug && Logger.Level.debug3 > Logger.eLevelMinimum)
                            (new Logger()).WriteError(ex);
                    }
				}
				uint nMemoryReservedForMerge = 2 * 1024 * 1024; //PREFERENCES: should become a setting such as <memory reserved="2097152" />
				uint nMemoryStarvationThreshold = cCUDA.TotalMemory / 2; //PREFERENCES: should become a percentage setting such as <memory starvation="50%" />
				uint nMemoryFree;
                string sModule = "CUDAFunctions_" + Preferences.nCUDAVersion + "_x" + (IntPtr.Size * 8);
                if (Logger.bDebug)
                    (new Logger()).WriteDebug3(sModule);
                cCUDA.LoadModule((byte[])Properties.Resource.ResourceManager.GetObject(sModule)); //   $(ProjectDir)Resources\CUDAFunctions.cubin
				//cCUDA.LoadModule(@"c:\projects\!helpers\video\PixelsMap\Resources\CUDAFunctions.cubin");
				CUfunction cCUDAFuncMerge = cCUDA.GetModuleFunction("CUDAFrameMerge");
				int nThreadsPerBlock = 256; //had to go down from 512 to 256 threads per block: after adding "motion" and float operations the launch failed with: Too Many Resources Requested for Launch (This error means that the number of registers available on the multiprocessor is being exceeded. Reduce the number of threads per block to solve the problem)
				cCUDA.SetFunctionBlockShape(cCUDAFuncMerge, nThreadsPerBlock, 1, 1);
				CUDADriver.cuParamSetSize(cCUDAFuncMerge, 8);

				Dictionary<ulong, CUdeviceptr> ahDevicePointers = new Dictionary<ulong, CUdeviceptr>();
				CUdeviceptr cPMs;
				CUdeviceptr cInfos;
				CUdeviceptr cAlphaMap;
				{
					// Flat lookup table of blended values: 254 alpha values (1..254) x 256 backgrounds x 256 foregrounds.
					byte[] aAlphaMap = new byte[16646144];
					int nResult, nIndx = 0;
					for (byte nAlpha = 1; 255 > nAlpha; nAlpha++)
					{
						for (ushort nBackground = 0; 256 > nBackground; nBackground++)
						{
							for (ushort nForeground = 0; 256 > nForeground; nForeground++)
							{
								if (255 < (nResult = (int)((float)(nAlpha * (nForeground - nBackground)) / 255 + nBackground + 0.5)))
									nResult = 255;
								aAlphaMap[nIndx++] = (byte)nResult;
							}
						}
					}
					cAlphaMap = cCUDA.CopyHostToDevice<byte>(aAlphaMap);
				}
				// An earlier, commented-out variant built a two-level table of device pointers
				// (alpha -> background -> 256 results) instead of the flat map above.

#if DEBUG
				Dictionary<ulong, DateTime> ahDebug = new Dictionary<ulong,DateTime>();
#endif
				DateTime dtNextTime = DateTime.MinValue, dtNow;
				long nStartTick; // logging
				while (true)
				{
					// When the queue is empty, log the free device memory at most once every 60 seconds.
					if (1 > _aqCommands.CountGet() && (dtNow = DateTime.Now) > dtNextTime)
					{
						dtNextTime = dtNow.AddSeconds(60);
#if DEBUG
						dtNow = dtNow.Subtract(TimeSpan.FromHours(2));
						string sMessage = "";
						foreach (ulong nID in ahDebug.Keys)
							if (dtNow > ahDebug[nID])
								sMessage += "<br>[" + nID + ":" + ahDebug[nID].ToString("HH:mm:ss") + "]";
#endif
						(new Logger()).WriteDebug("CUDA free memory:" + cCUDA.FreeMemory
#if DEBUG
							+ "; possibly timeworn allocations:" + (1 > sMessage.Length ? "no" : sMessage)
#endif
						);
					}
					// Retry until a command has been dequeued; dequeue failures are only logged.
					while (true)
					{
						try
						{
							cCmd = _aqCommands.Dequeue();  //sleeps when there is nothing to hand out
							break;
						}
						catch (Exception ex)
						{
							(new Logger()).WriteError(ex);
						}
					}
					_CommandsCount = _aqCommands.nCount;
					switch (cCmd.eID)
					{
						case Command.ID.Allocate:
							#region
							try
							{
								cCmd.cPM._cException = null;
								if (1 > cCmd.cPM._nID)
								{
									if (0 < cCmd.cPM._nBytesQty)
									{
										nMemoryFree = cCUDA.FreeMemory;
										// Compare in signed 64-bit: with unsigned arithmetic a request larger than
										// the free memory wraps around to a huge value and wrongly passes this check.
										if (nMemoryReservedForMerge < (long)nMemoryFree - (long)cCmd.cPM._nBytesQty)
										{
											bMemoryStarvation = (nMemoryFree < nMemoryStarvationThreshold);
											cCmd.cPM._nID = _nCurrentID++;
											ahDevicePointers.Add(cCmd.cPM._nID, cCUDA.Allocate(cCmd.cPM._nBytesQty));
#if DEBUG
											ahDebug.Add(cCmd.cPM._nID, DateTime.Now);
#endif
										}
										else
										{
											bMemoryStarvation = true;
											throw new Exception("out of memory in CUDA device during Allocate. Only 2 MBytes reserved for the Merge");
										}
									}
									else
										throw new Exception("bytes quantity in PixelsMap have to be greater than zero for Allocate [_bDisposed = " + cCmd.cPM._bDisposed + "][_bProcessing = " + cCmd.cPM._bProcessing + "][_bShiftVertical = " + cCmd.cPM._bShiftVertical + "][_bTemp = " + cCmd.cPM._bTemp + "][_dt = " + cCmd.cPM._dt + "][_nBytesQty = " + cCmd.cPM._nBytesQty + "][_nID = " + cCmd.cPM._nID + "][_nShiftPosition = " + cCmd.cPM._nShiftPosition + "][_stArea.nHeight = " + cCmd.cPM._stArea.nHeight + "][_stArea.nWidth = " + cCmd.cPM._stArea.nWidth + "][bKeepAlive = " + cCmd.cPM.bKeepAlive + "][bBackgroundClear = " + cCmd.cPM.bBackgroundClear + "][eAlpha = " + cCmd.cPM.eAlpha + "][bCUDA = " + cCmd.cPM.bCUDA + "][nAlphaConstant = " + cCmd.cPM.nAlphaConstant + "][nID = " + cCmd.cPM.nID + "][nLength = " + cCmd.cPM.nLength + "][stArea.nHeight = " + cCmd.cPM.stArea.nHeight + "][stArea.nWidth = " + cCmd.cPM.stArea.nWidth + "]");
								}
								else
									throw new Exception("PixelsMap ID have to be zero for Allocate");
							}
							catch (Exception ex)
							{
								if (ex is CUDAException)
									ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
								(new Logger()).WriteError(ex);
								(new Logger()).WriteDebug("bytes qty:" + cCmd.cPM._nBytesQty);
								cCmd.cPM._cException = ex;
							}
							// Wake the caller waiting on this command, whether it succeeded or not.
							cCmd.cMRE.Set();
							break;
							#endregion
						case Command.ID.CopyIn:
							#region
							nStartTick = DateTime.Now.Ticks; // logging
							try
							{
								cCmd.cPM._cException = null;
								if (1 > cCmd.cPM._nID)
								{
									// Not allocated yet: allocate and upload in one step.
									// Signed 64-bit comparison for the same wrap-around reason as in Allocate.
									if ((long)cCUDA.FreeMemory - (long)cCmd.cPM._nBytesQty > nMemoryReservedForMerge)
									{
										cCmd.cPM._nID = _nCurrentID++;
										if (cCmd.ahParameters.ContainsKey(typeof(IntPtr)))
											ahDevicePointers.Add(cCmd.cPM._nID, cCUDA.CopyHostToDevice((IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty));
										else if (cCmd.ahParameters.ContainsKey(typeof(byte[])))
											ahDevicePointers.Add(cCmd.cPM._nID, cCUDA.CopyHostToDevice((byte[])cCmd.ahParameters[typeof(byte[])]));
										else
											throw new Exception("unknown parameter type");
#if DEBUG
											ahDebug.Add(cCmd.cPM._nID, DateTime.Now);
#endif
									}
									else
										throw new Exception("out of memory in CUDA device during CopyIn. Only 2 MBytes reserved for the Merge.");
								}
								else
								{
									// Already allocated: overwrite the existing device buffer.
									if (cCmd.ahParameters.ContainsKey(typeof(IntPtr)))
										cCUDA.CopyHostToDevice(ahDevicePointers[cCmd.cPM._nID], (IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty);
									else if (cCmd.ahParameters.ContainsKey(typeof(byte[])))
										cCUDA.CopyHostToDevice(ahDevicePointers[cCmd.cPM._nID], (byte[])cCmd.ahParameters[typeof(byte[])]);
									else
										throw new Exception("unknown parameter type");
								}
								if (ahDevicePointers.ContainsKey(cCmd.cPM._nID))
									(new Logger()).WriteDebug5("copy in [id:" + cCmd.cPM._nID + "][ptr:" + ahDevicePointers[cCmd.cPM._nID].Pointer + "]");
								else
									(new Logger()).WriteDebug5("copy in [id:" + cCmd.cPM._nID + "][ptr: not in dictionary]");
							}
							catch (Exception ex)
							{
								if (ex is CUDAException)
									ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
								(new Logger()).WriteError(ex);
								cCmd.cPM._cException = ex;
							}
							if (new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds >= 20)    // logging
								(new Logger()).WriteNotice("PixelMap: Command.ID.CopyIn: execution time > 20ms: " + new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds +"ms");    // logging
							cCmd.cMRE.Set();
							break;
							#endregion
						case Command.ID.CopyOut:
							#region
							nStartTick = DateTime.Now.Ticks; // logging
							try
							{
								if (0 < cCmd.cPM._nID)
								{
									if(!cCmd.ahParameters.ContainsKey(typeof(IntPtr)))
									{
										// Copy into a managed array: the caller's if supplied, otherwise a new one.
										if(cCmd.ahParameters.ContainsKey(typeof(byte[])))
										{
											cCmd.cPM._aBytes = (byte[])cCmd.ahParameters[typeof(byte[])];
											if(cCmd.cPM._nBytesQty != cCmd.cPM._aBytes.Length)
												(new Logger()).WriteWarning("wrong array size for copyout [got:" + cCmd.cPM._aBytes.Length + "][expected:" + cCmd.cPM._nBytesQty + "]");
										}
										else
											cCmd.cPM._aBytes = new byte[cCmd.cPM._nBytesQty];
										cCUDA.CopyDeviceToHost<byte>(ahDevicePointers[cCmd.cPM._nID], cCmd.cPM._aBytes);
									}
									else 
										cCUDA.CopyDeviceToHost(ahDevicePointers[cCmd.cPM._nID], (IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty);
									(new Logger()).WriteDebug5("copy out [id:" + cCmd.cPM._nID + "][ptr:" + ahDevicePointers[cCmd.cPM._nID].Pointer + "]");
								}
								else
									throw new Exception("PixelsMap have to be allocated for CopyOut");
							}
							catch (Exception ex)
							{
								if (ex is CUDAException)
									ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
								(new Logger()).WriteError(ex);
								cCmd.cPM._cException = ex;
							}
							if (new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds >= 20)    // logging
								(new Logger()).WriteNotice("PixelMap: Command.ID.CopyOut: execution time > 20ms: " + new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds +"ms");    // logging
							cCmd.cMRE.Set();
							break;
							#endregion
						case Command.ID.Merge:
							#region
							try
							{
								List<PixelsMap> aPMs = (List<PixelsMap>)cCmd.ahParameters[typeof(List<PixelsMap>)];
								DisCom.MergeInfo cMergeInfo = (DisCom.MergeInfo)cCmd.ahParameters[typeof(DisCom.MergeInfo)];
								List<IntPtr> aDPs = new List<IntPtr>();

								if (1 > cCmd.cPM._nID)
									throw new Exception("background PixelsMap have to be allocated for Merge");

								// Device pointer list for the kernel: the background first, then every layer.
								aDPs.Add((IntPtr)ahDevicePointers[cCmd.cPM._nID].Pointer);
								for (int nIndx = 0; nIndx < aPMs.Count; nIndx++)
								{
									if (!ahDevicePointers.ContainsKey(aPMs[nIndx]._nID))
										throw new Exception("there is a corrupted ID in layers for merge [id:" + aPMs[nIndx]._nID + "]");
									if (1 > ahDevicePointers[aPMs[nIndx]._nID].Pointer)
										throw new Exception("there is an empty pointer in layers for merge [id:" + aPMs[nIndx]._nID + "]");
									aDPs.Add((IntPtr)ahDevicePointers[aPMs[nIndx]._nID].Pointer);
								}

								cPMs = cCUDA.CopyHostToDevice<IntPtr>(aDPs.ToArray());
								cInfos = cCUDA.CopyHostToDevice(cMergeInfo, cMergeInfo.SizeGet());

								cCUDA.SetParameter<IntPtr>(cCUDAFuncMerge, 0, (IntPtr)cPMs.Pointer);
								cCUDA.SetParameter<IntPtr>(cCUDAFuncMerge, IntPtr.Size, (IntPtr)cInfos.Pointer);
								cCUDA.SetParameter<IntPtr>(cCUDAFuncMerge, IntPtr.Size * 2, (IntPtr)cAlphaMap.Pointer);
								cCUDA.SetParameterSize(cCUDAFuncMerge, (uint)(IntPtr.Size * 3));
								// Number of blocks: background size divided by threads per block, rounded up.
								int nIterations = (0 == cMergeInfo.nBackgroundSize % nThreadsPerBlock ? cMergeInfo.nBackgroundSize / nThreadsPerBlock : cMergeInfo.nBackgroundSize / nThreadsPerBlock + 1);
								cCUDA.Launch(cCUDAFuncMerge, nIterations, 1);
								cCmd.cMRE.Set();

                                cMergeInfo.Dispose();

								cCUDA.Free(cPMs);
								cCUDA.Free(cInfos);
								for (int nIndx = 0; nIndx < aPMs.Count; nIndx++)
								{
									lock (aPMs[nIndx]._cSyncRoot)
										aPMs[nIndx]._bProcessing = false;
									aPMs[nIndx].Dispose();
								}
							}
							catch (Exception ex)
							{
								cCmd.cMRE.Set();
								if (ex is CUDAException)
									ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
								(new Logger()).WriteError(ex);
								cCmd.cPM._cException = ex;
							}
							break;
							#endregion
						case Command.ID.Dispose:
							#region
							nStartTick = DateTime.Now.Ticks; // logging
							(new Logger()).Write(Logger.Level.debug2, "dispose: in");
							try
							{
								if (ahDevicePointers.ContainsKey(cCmd.cPM._nID))
								{
									if (0 < cCmd.cPM._nID && 0 < ahDevicePointers[cCmd.cPM._nID].Pointer)
									{
										cCUDA.Free(ahDevicePointers[cCmd.cPM._nID]);
										//cCUDA.SynchronizeContext();
										bMemoryStarvation = (cCUDA.FreeMemory < nMemoryStarvationThreshold);
										(new Logger()).WriteDebug3("dispose [id:" + cCmd.cPM._nID + "][ptr:" + ahDevicePointers[cCmd.cPM._nID].Pointer + "]");
									}
									ahDevicePointers.Remove(cCmd.cPM._nID);
#if DEBUG
									ahDebug.Remove(cCmd.cPM._nID);
#endif
									cCmd.cPM._nID = 0;
								}
							}
							catch (Exception ex)
							{
								if (ex is CUDAException)
									ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
								(new Logger()).WriteError(ex);
								cCmd.cPM._cException = ex;
							}
							(new Logger()).Write(Logger.Level.debug2, "dispose: out");
							if (new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds >= 20)    // logging
								(new Logger()).WriteNotice("PixelMap: Command.ID.Dispose: execution time > 20ms: " + new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds +"ms");    // logging
							break;
							#endregion
					}
				}
			}
			catch (Exception ex)
			{
				(new Logger()).WriteError(ex);
			}
		}
Example #42
0
        /// <summary>
        /// Builds a random signal and filter kernel, pads them, uploads both to the device and
        /// creates a 1D complex-to-complex CUFFT plan for the padded size.
        /// </summary>
        /// <remarks>
        /// Execution currently ends right after the plan has been created. Everything below the
        /// early return (FFT execution, kernel launch, host verification, cleanup) is unreachable
        /// and is kept only as reference.
        /// </remarks>
        /// <param name="args">Command-line arguments; not read.</param>
        static void Main(string[] args)
        {
            // Init and select 1st device.
            CUDA cuda = new CUDA(0, true);

            // load module
            //cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "simpleCUFFT.ptx"));
            CUfunction func = new CUfunction();// cuda.GetModuleFunction("ComplexPointwiseMulAndScale");

            // The filter size is assumed to be a number smaller than the signal size
            const int SIGNAL_SIZE = 50;
            const int FILTER_KERNEL_SIZE = 11;

            // Allocate host memory for the signal
            Float2[] h_signal = new Float2[SIGNAL_SIZE];
            // Initialize the memory for the signal: random real part in [0, 1], zero imaginary part
            Random r = new Random();
            for (int i = 0; i < SIGNAL_SIZE; ++i)
            {
                h_signal[i].x = r.Next() / (float)int.MaxValue;
                h_signal[i].y = 0;
            }

            // Allocate host memory for the filter
            Float2[] h_filter_kernel = new Float2[FILTER_KERNEL_SIZE];
            // Initialize the memory for the filter the same way
            for (int i = 0; i < FILTER_KERNEL_SIZE; ++i)
            {
                h_filter_kernel[i].x = r.Next() / (float)int.MaxValue;
                h_filter_kernel[i].y = 0;
            }

            // Pad signal and filter kernel
            Float2[] h_padded_signal;
            Float2[] h_padded_filter_kernel;
            int new_size = PadData(h_signal, out h_padded_signal, SIGNAL_SIZE,
                                   h_filter_kernel, out h_padded_filter_kernel, FILTER_KERNEL_SIZE);

            // Allocate device memory for signal
            // Copy host memory to device
            CUdeviceptr d_signal = cuda.CopyHostToDevice<Float2>(h_padded_signal);

            // Allocate device memory for filter kernel
            // Copy host memory to device
            CUdeviceptr d_filter_kernel = cuda.CopyHostToDevice<Float2>(h_padded_filter_kernel);

            // CUFFT plan
            CUFFT fft = new CUFFT(cuda);
            cufftHandle handle = new cufftHandle();
            // NOTE(review): fftres is never inspected and the plan handle is never destroyed.
            CUFFTResult fftres = CUFFTDriver.cufftPlan1d(ref handle, new_size, CUFFTType.C2C, 1);
            //fft.Plan1D(new_size, CUFFTType.C2C, 1);

            // The sample stops here. The cleanup at the bottom of the method is never reached,
            // so the device buffers have to be released before leaving.
            cuda.Free(d_signal);
            cuda.Free(d_filter_kernel);
            return;

            // ---- unreachable from here on; kept as reference ----

            // Transform signal and kernel
            fft.ExecuteComplexToComplex(d_signal, d_signal, CUFFTDirection.Forward);
            fft.ExecuteComplexToComplex(d_filter_kernel, d_filter_kernel, CUFFTDirection.Forward);

            // Multiply the coefficients together and normalize the result
            // ComplexPointwiseMulAndScale<<<32, 256>>>(d_signal, d_filter_kernel, new_size, 1.0f / new_size);
            // NOTE(review): the (uint) casts below keep only 32 bits of each pointer while the
            // offsets reserve IntPtr.Size bytes per pointer - confirm before re-enabling this path.
            cuda.SetFunctionBlockShape(func, 256, 1, 1);
            cuda.SetParameter(func, 0, (uint)d_signal.Pointer);
            cuda.SetParameter(func, IntPtr.Size, (uint)d_filter_kernel.Pointer);
            cuda.SetParameter(func, IntPtr.Size * 2, (uint)new_size);
            cuda.SetParameter(func, IntPtr.Size * 2 + 4, 1.0f / new_size);
            cuda.SetParameterSize(func, (uint)(IntPtr.Size * 2 + 8));
            cuda.Launch(func, 32, 1);

            // Transform signal back
            fft.ExecuteComplexToComplex(d_signal, d_signal, CUFFTDirection.Inverse);

            // Copy device memory to host (the padded signal array is reused as the destination)
            Float2[] h_convolved_signal = h_padded_signal;
            cuda.CopyDeviceToHost<Float2>(d_signal, h_convolved_signal);

            // Allocate host memory for the convolution result
            Float2[] h_convolved_signal_ref = new Float2[SIGNAL_SIZE];

            // Convolve on the host
            Convolve(h_signal, SIGNAL_SIZE,
                     h_filter_kernel, FILTER_KERNEL_SIZE,
                     h_convolved_signal_ref);

            // check result
            bool res = cutCompareL2fe(h_convolved_signal_ref, h_convolved_signal, 2 * SIGNAL_SIZE, 1e-5f);
            Console.WriteLine("Test {0}", (true == res) ? "PASSED" : "FAILED");

            //Destroy CUFFT context
            fft.Destroy();

            // cleanup memory
            cuda.Free(d_signal);
            cuda.Free(d_filter_kernel);
        }
Example #43
0
        /// <summary>
        /// Generates N sparse vectors in CSR layout, launches the "RBFspmv_csr_vector" kernel from
        /// structKernel.cubin with a dense copy of the vector at <c>mainIndex</c> bound to the
        /// "texRef" texture, and returns what the kernel wrote to its output buffer.
        /// </summary>
        /// <returns>N floats copied back from the device output buffer.</returns>
        private static float[] CuRBFCSRCached()
        {
            // The context is disposed on exit so that repeated calls do not accumulate contexts.
            using (CUDA cuda = new CUDA(0, true))
            {
                // load module
                CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));

                CUfunction structPassFunc = cuda.GetModuleFunction("RBFspmv_csr_vector");

                int maxRowSize = avgElements + stdElements - 1;

                Console.WriteLine("init arrays");
                Stopwatch t = Stopwatch.StartNew();
                List<float> vecValsL = new List<float>(N * maxRowSize / 2);
                List<int> vecIdxL = new List<int>(N * maxRowSize / 2);
                // one start offset per vector plus the closing offset appended after the loop
                List<int> vecLenghtL = new List<int>(N + 1);

                // squared norm of every vector
                float[] selfDot = new float[N];

                maxIndex = 0;
                int vecStartIdx = 0;
                for (int i = 0; i < N; i++)
                {
                    int vecSize = avgElements + i % stdElements;

                    float[] vals = Helpers.InitValues(i, vecSize, maxVal);
                    vecValsL.AddRange(vals);

                    for (int z = 0; z < vals.Length; z++)
                    {
                        selfDot[i] += vals[z] * vals[z];
                    }

                    int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);
                    vecIdxL.AddRange(index);

                    vecLenghtL.Add(vecStartIdx);
                    vecStartIdx += vecSize;
                }
                //for last index
                vecLenghtL.Add(vecStartIdx);

                float[] vecVals = vecValsL.ToArray();
                int[] vecIdx = vecIdxL.ToArray();
                int[] vecLenght = vecLenghtL.ToArray();

                // dense copy of the vector stored at mainIndex
                float[] mainVec = new float[maxIndex + 1];
                for (int j = vecLenght[mainIndex]; j < vecLenght[mainIndex + 1]; j++)
                {
                    mainVec[vecIdx[j]] = vecVals[j];
                }
                Console.WriteLine("Init takes {0}", t.Elapsed);

                // Start() on a running stopwatch does nothing; reset it so that the next
                // figure covers the upload only.
                t.Reset();
                t.Start();

                CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals);
                CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx);
                CUdeviceptr vecLenghtPtr = cuda.CopyHostToDevice(vecLenght);
                CUdeviceptr selfDotPtr = cuda.CopyHostToDevice(selfDot);

                //copy to texture
                CUarray cuArr = cuda.CreateArray(mainVec);
                cuda.CopyHostToArray(cuArr, mainVec, 0);
                CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef");
                cuda.SetTextureFlags(cuTexRef, 0);
                cuda.SetTextureArray(cuTexRef, cuArr);

                float[] output = new float[N];
                CUdeviceptr dOutput = cuda.Allocate(output);

                Console.WriteLine("copy to device takes {0}", t.Elapsed);

                cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1);

                // Kernel arguments are laid out back to back: five pointers, then
                // N, mainIndex, Gamma and the total element count.
                int offset = 0;
                cuda.SetParameter(structPassFunc, offset, valsPtr.Pointer);
                offset += IntPtr.Size;
                cuda.SetParameter(structPassFunc, offset, idxPtr.Pointer);
                offset += IntPtr.Size;

                cuda.SetParameter(structPassFunc, offset, vecLenghtPtr.Pointer);
                offset += IntPtr.Size;
                cuda.SetParameter(structPassFunc, offset, selfDotPtr.Pointer);
                offset += IntPtr.Size;

                cuda.SetParameter(structPassFunc, offset, dOutput.Pointer);
                offset += IntPtr.Size;

                cuda.SetParameter(structPassFunc, offset, (uint)N);
                offset += sizeof(int);
                cuda.SetParameter(structPassFunc, offset, (uint)mainIndex);
                offset += sizeof(int);
                cuda.SetParameter(structPassFunc, offset, Gamma);
                offset += sizeof(float);

                cuda.SetParameter(structPassFunc, offset, (uint)vecStartIdx);
                offset += sizeof(int);
                cuda.SetParameterSize(structPassFunc, (uint)offset);

                Console.WriteLine("start computation");

                CUevent start = cuda.CreateEvent();
                CUevent end = cuda.CreateEvent();

                Stopwatch timer = Stopwatch.StartNew();
                cuda.RecordEvent(start);
                cuda.Launch(structPassFunc, blocksPerGrid, 1);

                cuda.RecordEvent(end);

                cuda.SynchronizeContext();
                //cuda.SynchronizeEvent(end);
                timer.Stop();
                float naiveTime = cuda.ElapsedTime(start, end);

                // Both figures are reported in milliseconds, as the message says.
                Console.Write("csr vector Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed.TotalMilliseconds);

                cuda.CopyDeviceToHost(dOutput, output);

                int lenght = Math.Min(displayCount, N);
                Console.WriteLine();
                for (int i = 0; i < lenght; i++)
                {
                    Console.WriteLine("{0}-{1}", i, output[i]);
                }

                cuda.Free(valsPtr);
                cuda.Free(idxPtr);
                cuda.Free(dOutput);
                cuda.Free(selfDotPtr);
                cuda.Free(vecLenghtPtr);
                cuda.DestroyArray(cuArr);
                cuda.DestroyTexture(cuTexRef);
                cuda.DestroyEvent(start);
                cuda.DestroyEvent(end);

                return output;
            }
        }
        /// <summary>
        /// Runs a sparse matrix product kernel where matrix A stays on the device in CRS form and
        /// matrix B is fed one index <c>k</c> at a time: a host buffer is filled for <c>k</c>,
        /// copied asynchronously into the device memory bound to the "vectorTexRef" texture, and
        /// the kernel is launched with <c>k</c> as its last argument.
        /// </summary>
        /// <param name="repetition">How many times the whole pass over B is repeated.</param>
        /// <param name="moduleFunction">Name of the kernel to load from matrixKernels.cubin.</param>
        /// <param name="blockSizeX">Block size X.</param>
        /// <param name="blockSizeY">Block size Y.</param>
        /// <returns>Rows * Cols floats copied from the mapped host output buffer.</returns>
        public static float[] CRSSparseMMwithDenseVector(int repetition,
            string moduleFunction, int blockSizeX, int blockSizeY)
        {
            // The context is disposed on exit so that repeated calls do not accumulate contexts.
            using (CUDA cuda = new CUDA(0, true))
            {
                // load module
                CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin"));

                CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction);

                int maxRowSize = avgElements + stdElements - 1;

                Console.WriteLine("------------------------------------");
                Console.WriteLine("init Matrix");
                Stopwatch t = Stopwatch.StartNew();

                //values in CRS format
                float[] AVals, BVals;
                //indexes in Crs format
                int[] AIdx, BIdx;
                //Length of each row in CRS format
                int[] ARowLen, BRowLen;

                // maxIndex ends up holding the value reported for B (the second call overwrites it)
                int maxIndex;
                MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex);

                // DisplayCrsMatrix(AVals, AIdx, ARowLen,maxIndex);
                MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex);
                //DisplayCrsMatrix(BVals, BIdx, BRowLen, maxIndex);

                Console.WriteLine("Init takes {0}", t.Elapsed);

                // Start() on a running stopwatch does nothing; reset it so that the next
                // figure covers the upload only.
                t.Reset();
                t.Start();

                CUdeviceptr AValsPtr = cuda.CopyHostToDevice(AVals);
                CUdeviceptr AIdxPtr = cuda.CopyHostToDevice(AIdx);
                CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen);

                int outputSize = Rows * Cols;
                float[] output = new float[outputSize];

                //allocate memory for output
                IntPtr outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
                CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0);

                // size in bytes of one dense vector (indices 0..maxIndex)
                uint memSize = (uint)((maxIndex + 1) * sizeof(float));

                CUstream stream0 = cuda.CreateStream();

                // two host staging buffers, used alternately (k % 2) inside the loop below
                IntPtr[] mainVecIntPtrs = new IntPtr[2];

                //write combined memory allocation
                //IntPtr mainVecIPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED);
                //CUdeviceptr mainVecPtr=cuda.CopyHostToDeviceAsync(mainVecIPtr,memSize,stream0);

                //
                //mainVecIntPtrs[0] = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED);
                //mainVecIntPtrs[1] = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED);

                mainVecIntPtrs[0] = cuda.AllocateHost(memSize);
                mainVecIntPtrs[1] = cuda.AllocateHost(memSize);
                CUdeviceptr mainVecPtr = cuda.CopyHostToDeviceAsync(mainVecIntPtrs[0], memSize, stream0);

                //IntPtr mainVecIPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_PORTABLE);
                //CUdeviceptr mainVecPtr=cuda.CopyHostToDeviceAsync(mainVecIPtr,memSize,stream0);

                //mapped memory allocation
                //IntPtr mainVecIPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
                //CUdeviceptr mainVecPtr = cuda.CopyHostToDevice(mainVecIPtr, memSize);

                //get texture reference
                CUtexref cuTexRef = cuda.GetModuleTexture(module, "vectorTexRef");
                cuda.SetTextureFlags(cuTexRef, 0);
                cuda.SetTextureAddress(cuTexRef, mainVecPtr, memSize);

                Console.WriteLine("copy to device takes {0}", t.Elapsed);
                #region set cuda parameters

                cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1);

                // Kernel arguments are laid out back to back: four pointers, Rows, Cols
                // and finally the index that is rewritten before every launch.
                int offset = 0;
                cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer);
                offset += IntPtr.Size;
                cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer);
                offset += IntPtr.Size;
                cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer);
                offset += IntPtr.Size;

                cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
                offset += IntPtr.Size;

                cuda.SetParameter(cuFunc, offset, (uint)Rows);
                offset += sizeof(int);
                cuda.SetParameter(cuFunc, offset, (uint)Cols);
                offset += sizeof(int);

                int colIndexParamOffset = offset;
                cuda.SetParameter(cuFunc, offset, (uint)0);
                offset += sizeof(int);
                cuda.SetParameterSize(cuFunc, (uint)offset);
                #endregion
                Console.WriteLine("start computation");

                CUevent start = cuda.CreateEvent();
                CUevent end = cuda.CreateEvent();

                // number of blocks needed to cover all rows
                int gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX));

                Stopwatch timer = Stopwatch.StartNew();
                cuda.RecordEvent(start);
                for (int rep = 0; rep < repetition; rep++)
                {
                    for (int k = 0; k < Cols; k++)
                    {
                        // fill the staging buffer selected for this k
                        Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIntPtrs[k % 2]);

                        // wait for the work queued in the previous iteration
                        cuda.SynchronizeStream(stream0);

                        cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIntPtrs[k % 2], memSize, stream0);
                        cuda.SetParameter(cuFunc, colIndexParamOffset, (uint)k);
                        cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0);
                        //cuda.SynchronizeStream(stream0);
                        ////clear host buffer
                        // NOTE(review): for k == 0 this passes index -1 to the helper - confirm
                        // that Helpers.SetBufferIdx tolerates it.
                        Helpers.SetBufferIdx(BIdx, BRowLen, k - 1, mainVecIntPtrs[(k + 1) % 2], 0.0f);

                        //Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIPtr);
                        ////make asynchronous copy and kernel launch
                        //cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIPtr, memSize, stream0);
                        //cuda.SetParameter(cuFunc, colIndexParamOffset,(uint) k);
                        //cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0);
                        //cuda.SynchronizeStream(stream0);
                        ////clear host buffer
                        //Helpers.SetBufferIdx(BIdx, BRowLen, k, mainVecIPtr, 0.0f);
                    }
                }
                cuda.RecordEvent(end);
                cuda.SynchronizeContext();

                timer.Stop();
                float cudaTime = cuda.ElapsedTime(start, end);

                // the kernel wrote into mapped host memory; copy it into the managed array
                Marshal.Copy(outputPtr2, output, 0, outputSize);

                Console.WriteLine("Matrix products with kernel {0}", moduleFunction);
                // Both figures are reported in milliseconds, as the message says.
                Console.WriteLine("  takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed.TotalMilliseconds);

                // never print past the end of the result
                int lenght = Math.Min(displayCount, output.Length);
                Console.WriteLine();
                for (int i = 0; i < lenght; i++)
                {
                    Console.WriteLine("{0}-{1}", i, output[i]);
                }

                cuda.Free(AValsPtr);
                cuda.Free(AIdxPtr);
                cuda.Free(ALenghtPtr);
                // NOTE(review): dOutput aliases the host allocation outputPtr2, and the two
                // staging buffers in mainVecIntPtrs are never released explicitly - confirm
                // which release call the host allocations need.
                cuda.Free(dOutput);
                cuda.DestroyEvent(start);
                cuda.DestroyEvent(end);

                cuda.DestroyStream(stream0);
                cuda.Free(mainVecPtr);
                cuda.DestroyTexture(cuTexRef);

                return output;
            }
        }
        /// <summary>
        /// Implementation of the sparse matrix product: two random CRS matrices are uploaded to
        /// the device and the selected kernel is launched over a two-dimensional grid.
        /// </summary>
        /// <param name="repetition">how many times the kernel should be launched</param>
        /// <param name="moduleFunction">cuda kernel name</param>
        /// <param name="blockSizeX">block size X</param>
        /// <param name="blockSizeY">block size Y</param>
        /// <param name="transposeGrid">indicates that the grid dimensions should be
        /// computed alternatively: if false, gridDim.Y is derived from the rows,
        /// otherwise gridDim.Y is derived from the columns</param>
        /// <returns>Rows * Cols floats copied from the mapped host output buffer</returns>
        public static float[] CRSSparseMM(int repetition, string moduleFunction, 
            int blockSizeX,int blockSizeY, bool transposeGrid)
        {
            //int blockSizeX = 4;
            //int blockSizeY = 4;

            // The context is disposed on exit so that repeated calls do not accumulate contexts.
            using (CUDA cuda = new CUDA(0, true))
            {
                // load module
                cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin"));

                CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction);

                int maxRowSize = avgElements + stdElements - 1;

                Console.WriteLine("------------------------------------");
                Console.WriteLine("init Matrix");
                Stopwatch t = Stopwatch.StartNew();

                //values in CRS format
                float[] AVals, BVals;
                //indexes in Crs format
                int[] AIdx, BIdx;
                //Length of each row in CRS format
                int[] ARowLen, BRowLen;
                int maxIndex;
                MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex);

                // DisplayCrsMatrix(AVals, AIdx, ARowLen,maxIndex);
                MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex);
                //DisplayCrsMatrix(BVals, BIdx, BRowLen, maxIndex);

                Console.WriteLine("Init takes {0}", t.Elapsed);

                // Start() on a running stopwatch does nothing; reset it so that the next
                // figure covers the upload only.
                t.Reset();
                t.Start();

                CUdeviceptr AValsPtr = cuda.CopyHostToDevice(AVals);
                CUdeviceptr AIdxPtr = cuda.CopyHostToDevice(AIdx);
                CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen);

                CUdeviceptr BValsPtr = cuda.CopyHostToDevice(BVals);
                CUdeviceptr BIdxPtr = cuda.CopyHostToDevice(BIdx);
                CUdeviceptr BLenghtPtr = cuda.CopyHostToDevice(BRowLen);

                int outputSize = Rows * Cols;
                float[] output = new float[outputSize];
                //CUdeviceptr dOutput = cuda.Allocate(output);

                // the kernel writes into host memory that is mapped into the device address space
                IntPtr outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
                CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0);

                Console.WriteLine("copy to device takes {0}", t.Elapsed);
                #region set cuda parameters

                int Aelements = AVals.Length;
                int Belements = BVals.Length;

                cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1);

                // Kernel arguments are laid out back to back: seven pointers followed by
                // Rows, Cols and the element counts of A and B.
                int offset = 0;
                cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer);
                offset += IntPtr.Size;
                cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer);
                offset += IntPtr.Size;
                cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer);
                offset += IntPtr.Size;
                cuda.SetParameter(cuFunc, offset, BValsPtr.Pointer);
                offset += IntPtr.Size;
                cuda.SetParameter(cuFunc, offset, BIdxPtr.Pointer);
                offset += IntPtr.Size;
                cuda.SetParameter(cuFunc, offset, BLenghtPtr.Pointer);
                offset += IntPtr.Size;

                cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
                offset += IntPtr.Size;

                cuda.SetParameter(cuFunc, offset, (uint)Rows);
                offset += sizeof(int);
                cuda.SetParameter(cuFunc, offset, (uint)Cols);
                offset += sizeof(int);

                cuda.SetParameter(cuFunc, offset, (uint)Aelements);
                offset += sizeof(int);
                cuda.SetParameter(cuFunc, offset, (uint)Belements);
                offset += sizeof(int);

                cuda.SetParameterSize(cuFunc, (uint)offset);
                #endregion
                Console.WriteLine("start computation");

                CUevent start = cuda.CreateEvent();
                CUevent end = cuda.CreateEvent();

                //CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef");
                //cuda.SetTextureFlags(cuTexRef, 0);

                // number of blocks per grid dimension, rounded up
                int gridDimX;
                int gridDimY;
                if (transposeGrid)
                {
                    gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX));
                    gridDimY = (int)Math.Ceiling((0.0 + Cols) / blockSizeY);
                }
                else
                {
                    gridDimX = (int)Math.Ceiling((Cols + 0.0) / (blockSizeX));
                    gridDimY = (int)Math.Ceiling((0.0 + Rows) / blockSizeY);
                }

                Stopwatch timer = Stopwatch.StartNew();
                cuda.RecordEvent(start);

                for (int k = 0; k < repetition; k++)
                {
                    cuda.Launch(cuFunc, gridDimX, gridDimY);

                    cuda.SynchronizeContext();
                    //  cuda.CopyDeviceToHost(dOutput, output);
                    // the result copy is part of the timed section on purpose
                    Marshal.Copy(outputPtr2, output, 0, outputSize);
                }

                cuda.RecordEvent(end);

                cuda.SynchronizeContext();

                timer.Stop();
                float cudaTime = cuda.ElapsedTime(start, end);

                Console.WriteLine("Matrix products with kernel {0}", moduleFunction);
                // Both figures are reported in milliseconds, as the message says.
                Console.WriteLine("  takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed.TotalMilliseconds);

                // never print past the end of the result
                int lenght = Math.Min(displayCount, output.Length);
                Console.WriteLine();
                for (int i = 0; i < lenght; i++)
                {
                    Console.WriteLine("{0}-{1}", i, output[i]);
                }

                cuda.Free(AValsPtr);
                cuda.Free(AIdxPtr);
                cuda.Free(ALenghtPtr);

                cuda.Free(BValsPtr);
                cuda.Free(BIdxPtr);
                cuda.Free(BLenghtPtr);

                // NOTE(review): dOutput aliases the host allocation outputPtr2 - confirm which
                // release call the host allocation needs.
                cuda.Free(dOutput);

                cuda.DestroyEvent(start);
                cuda.DestroyEvent(end);

                return output;
            }
        }
Example #46
0
        /// <summary>
        /// Transposes a 4096 x 4096 float matrix on the device with the "transpose_naive" and
        /// "transpose" kernels, prints the average time per launch of each, and compares the
        /// device result with a host reference.
        /// </summary>
        /// <param name="args">Command-line arguments; not read.</param>
        static void Main(string[] args)
        {
            // Init and select 1st device.
            CUDA cuda = new CUDA(0, true);

            // load module
            //cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "transpose_kernel.cubin"));
            cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "transpose_kernel.ptx"));
            CUfunction transpose = cuda.GetModuleFunction("transpose");
            CUfunction transpose_naive = cuda.GetModuleFunction("transpose_naive");

            const int size_x = 4096;
            const int size_y = 4096;

            float[] h_idata = new float[size_x * size_y];
            for (int i = 0; i < h_idata.Length; i++)
            {
                h_idata[i] = (float)i;
            }

            // allocate device memory
            // copy host memory to device
            CUdeviceptr d_idata = cuda.CopyHostToDevice<float>(h_idata);
            CUdeviceptr d_odata = cuda.Allocate<float>(h_idata);

            // setup execution parameters; both kernels take the same argument list.
            // The pointers are passed at their native width: the layout reserves IntPtr.Size
            // bytes for each of them, so narrowing them to uint would drop the upper half.
            foreach (CUfunction kernel in new CUfunction[] { transpose_naive, transpose })
            {
                cuda.SetFunctionBlockShape(kernel, BLOCK_DIM, BLOCK_DIM, 1);
                cuda.SetParameter(kernel, 0, d_odata.Pointer);
                cuda.SetParameter(kernel, IntPtr.Size, d_idata.Pointer);
                cuda.SetParameter(kernel, IntPtr.Size * 2, (uint)size_x);
                cuda.SetParameter(kernel, IntPtr.Size * 2 + 4, (uint)size_y);
                cuda.SetParameterSize(kernel, (uint)(IntPtr.Size * 2 + 8));
            }

            // warmup so we don't time CUDA startup
            cuda.Launch(transpose_naive, size_x / BLOCK_DIM, size_y / BLOCK_DIM);
            cuda.Launch(transpose, size_x / BLOCK_DIM, size_y / BLOCK_DIM);
            //System.Threading.Thread.Sleep(10);
            int numIterations = 100;

            Console.WriteLine("Transposing a {0} by {1} matrix of floats...", size_x, size_y);
            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            // time the naive kernel
            cuda.RecordEvent(start);
            for (int i = 0; i < numIterations; i++)
            {
                cuda.Launch(transpose_naive, size_x / BLOCK_DIM, size_y / BLOCK_DIM);
            }
            cuda.SynchronizeContext();
            cuda.RecordEvent(end);
            cuda.SynchronizeContext();
            float naiveTime = cuda.ElapsedTime(start, end);
            Console.WriteLine("Naive transpose average time:     {0} ms\n", naiveTime / numIterations);

            // time the optimized kernel
            cuda.RecordEvent(start);
            for (int i = 0; i < numIterations; i++)
            {
                cuda.Launch(transpose, size_x / BLOCK_DIM, size_y / BLOCK_DIM);
            }
            cuda.SynchronizeContext();
            cuda.RecordEvent(end);
            cuda.SynchronizeContext();
            float optimizedTime = cuda.ElapsedTime(start, end);

            Console.WriteLine("Optimized transpose average time:     {0} ms\n", optimizedTime / numIterations);

            // d_odata holds the output of the last launch, i.e. of the optimized kernel
            float[] h_odata = new float[size_x * size_y];
            cuda.CopyDeviceToHost<float>(d_odata, h_odata);

            float[] reference = new float[size_x * size_y];
            computeGold(reference, h_idata, size_x, size_y);

            bool res = CompareF(reference, h_odata, size_x * size_y);
            Console.WriteLine("Test {0}", res ? "PASSED" : "FAILED");

            cuda.Free(d_idata);
            cuda.Free(d_odata);

            Console.ReadKey();
        }
Example #47
0
        /// <summary>
        /// Looks up a module texture by name, uploads <paramref name="data"/> to the device and
        /// sets the uploaded memory as the texture's address.
        /// </summary>
        /// <param name="cuda">CUDA wrapper used for every call.</param>
        /// <param name="texture">Receives the texture reference returned for <paramref name="texName"/>.</param>
        /// <param name="texName">Name passed to <c>GetModuleTexture</c>.</param>
        /// <param name="data">Host values to upload.</param>
        /// <param name="memPtr">Receives the device pointer of the uploaded copy.</param>
        internal static void SetTextureMemory(CUDA cuda, ref CUtexref texture, string texName, float[] data, ref CUdeviceptr memPtr)
        {
            texture = cuda.GetModuleTexture(texName);
            memPtr = cuda.CopyHostToDevice(data);

            // size of the uploaded data in bytes
            uint byteCount = (uint)(sizeof(float) * data.Length);
            cuda.SetTextureAddress(texture, memPtr, byteCount);
        }
Example #48
0
        /// <summary>
        /// Releases everything this instance holds through <c>cuda</c>: each device pointer is
        /// freed and reset to <c>IntPtr.Zero</c>, the three texture references are destroyed,
        /// the module is unloaded, and finally the CUDA object is disposed and set to null.
        /// Does nothing when <c>cuda</c> is already null.
        /// </summary>
        private void DisposeCuda()
        {
            // nothing to release
            if (cuda == null)
            {
                return;
            }

            //free all resources

            // vals* pointers
            cuda.Free(valsCSRPtr);
            cuda.Free(valsCSCPtr);
            valsCSRPtr.Pointer = IntPtr.Zero;
            valsCSCPtr.Pointer = IntPtr.Zero;

            // idx* pointers
            cuda.Free(idxCSRPtr);
            cuda.Free(idxCSCPtr);
            idxCSRPtr.Pointer = IntPtr.Zero;
            idxCSCPtr.Pointer = IntPtr.Zero;

            // vecLenght* pointers
            cuda.Free(vecLenghtCSRPtr);
            cuda.Free(vecLenghtCSCPtr);
            vecLenghtCSRPtr.Pointer = IntPtr.Zero;
            vecLenghtCSCPtr.Pointer = IntPtr.Zero;

            // grad* pointers
            cuda.Free(gradPtr);
            gradPtr.Pointer = IntPtr.Zero;
            cuda.Free(gradOldPtr);
            gradOldPtr.Pointer = IntPtr.Zero;

            // alpha* pointers
            cuda.Free(alphaPtr);
            alphaPtr.Pointer = IntPtr.Zero;
            cuda.Free(alphaTmpPtr);
            alphaTmpPtr.Pointer = IntPtr.Zero;
            cuda.Free(alphaOldPtr);
            alphaOldPtr.Pointer = IntPtr.Zero;

            // w* pointers
            cuda.Free(wVecPtr);
            wVecPtr.Pointer = IntPtr.Zero;
            cuda.Free(wTempVecPtr);
            wTempVecPtr.Pointer = IntPtr.Zero;

            // reduce* pointers
            cuda.Free(reduceBBAlphaPtr);
            reduceBBAlphaPtr.Pointer = IntPtr.Zero;
            cuda.Free(reduceBBGradPtr);
            reduceBBGradPtr.Pointer = IntPtr.Zero;
            cuda.Free(reduceBBAlphaGradPtr);
            reduceBBAlphaGradPtr.Pointer = IntPtr.Zero;

            cuda.Free(reduceObjAlphaPtr);
            reduceObjAlphaPtr.Pointer = IntPtr.Zero;
            cuda.Free(reduceObjWPtr);
            reduceObjWPtr.Pointer = IntPtr.Zero;

            cuda.Free(reduceGradMaxNormPtr);
            reduceGradMaxNormPtr.Pointer = IntPtr.Zero;

            //cuda.Free(diagPtr);
            //diagPtr.Pointer =IntPtr.Zero;
            //cuda.Free(stepBBPtr);
            //stepBBPtr.Pointer =IntPtr.Zero;

            // deltas and labels: memory first, then the matching texture reference
            cuda.Free(deltasPtr);
            deltasPtr.Pointer = IntPtr.Zero;
            cuda.DestroyTexture(cuDeltasTexRef);

            cuda.Free(labelsPtr);
            labelsPtr.Pointer = IntPtr.Zero;
            cuda.DestroyTexture(cuLabelsTexRef);

            cuda.DestroyTexture(cuWVecTexRef);

            // module and the CUDA object itself go last
            cuda.UnloadModule(cuModule);
            cuda.Dispose();
            cuda = null;
        }
Example #49
0
        //private static void InitMainVector(float[] vecVals, int[] vecIdx, int[] vecLenght, float[] mainVec)
        //{
        //    for (int j = vecLenght[mainIndex]; j < vecLenght[mainIndex + 1]; j++)
        //    {
        //        int idx = vecIdx[j];
        //        float val = vecVals[j];
        //        mainVec[idx] = val;
        //    }
        //}
        /// <summary>
        /// Runs the "spmv_csr_vector_kernel_wc" kernel from structKernel.cubin over N generated
        /// sparse vectors, using mapped (zero-copy) host memory for both the dense main vector
        /// (allocated write-combined) and the output buffer.
        /// </summary>
        /// <param name="repetition">
        /// Number of kernel launches. Each launch uses the next value of <c>mainIndex</c>,
        /// starting from <c>StartingIndex</c>.
        /// </param>
        /// <returns>
        /// The N results of the last launch, or a zero-filled array of length N when
        /// <paramref name="repetition"/> is not positive.
        /// </returns>
        private static float[] CuDotProdCSRwriteCombined(int repetition)
        {
            CUDA cuda = new CUDA(0, true);
            try
            {
                // load module
                cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));

                CUfunction cuFunc = cuda.GetModuleFunction("spmv_csr_vector_kernel_wc");

                int maxRowSize = avgElements + stdElements - 1;

                Console.WriteLine("init arrays");
                Stopwatch t = Stopwatch.StartNew();

                // temporary lists for values, indices and vector start offsets
                List<float> vecValsL = new List<float>(N * maxRowSize / 2);
                List<int> vecIdxL = new List<int>(N * maxRowSize / 2);
                List<int> vecLenghtL = new List<int>(N);

                maxIndex = 0;
                int vecStartIdx = 0;
                for (int i = 0; i < N; i++)
                {
                    int vecSize = avgElements + i % stdElements;

                    float[] vals = Helpers.InitValues(i, vecSize, maxVal);
                    vecValsL.AddRange(vals);

                    int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);
                    vecIdxL.AddRange(index);

                    // each entry is the offset at which vector i starts in the flattened arrays
                    vecLenghtL.Add(vecStartIdx);
                    vecStartIdx += vecSize;
                }
                // closing entry: one past the last element, so vector i spans [entry i, entry i+1)
                vecLenghtL.Add(vecStartIdx);

                float[] vecVals = vecValsL.ToArray();
                int[] vecIdx = vecIdxL.ToArray();
                int[] vecLenght = vecLenghtL.ToArray();

                Console.WriteLine("Init takes {0}", t.Elapsed);
                // Start() on a running stopwatch does nothing; reset first so that the
                // next reading covers only the copy phase.
                t.Reset();
                t.Start();

                CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals);
                CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx);
                CUdeviceptr vecLenghtPtr = cuda.CopyHostToDevice(vecLenght);

                float[] output = new float[N];

                // output buffer: host allocation mapped into the device address space
                IntPtr outputPtr2 = cuda.HostAllocate((uint)(N * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
                CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0);

                // dense main vector: mapped and write-combined host allocation
                uint memSize = (uint)((maxIndex + 1) * sizeof(float));
                uint flags = CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP | CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED;
                IntPtr mainVecIntPtr = cuda.HostAllocate(memSize, flags);
                CUdeviceptr mainVecPtr = cuda.GetHostDevicePointer(mainVecIntPtr, 0);

                Console.WriteLine("copy to device takes {0}", t.Elapsed);
                #region set cuda parameters
                cuda.SetFunctionBlockShape(cuFunc, threadsPerBlock, 1, 1);

                int offset = 0;
                cuda.SetParameter(cuFunc, offset, valsPtr.Pointer);
                offset += IntPtr.Size;
                cuda.SetParameter(cuFunc, offset, idxPtr.Pointer);
                offset += IntPtr.Size;

                cuda.SetParameter(cuFunc, offset, vecLenghtPtr.Pointer);
                offset += IntPtr.Size;

                cuda.SetParameter(cuFunc, offset, mainVecPtr.Pointer);
                offset += IntPtr.Size;

                cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
                offset += IntPtr.Size;

                cuda.SetParameter(cuFunc, offset, (uint)N);
                offset += sizeof(int);
                cuda.SetParameter(cuFunc, offset, (uint)vecStartIdx);
                offset += sizeof(int);
                cuda.SetParameterSize(cuFunc, (uint)offset);
                #endregion
                Console.WriteLine("start computation");

                CUevent start = cuda.CreateEvent();
                CUevent end = cuda.CreateEvent();

                mainIndex = StartingIndex;
                Stopwatch timer = Stopwatch.StartNew();
                cuda.RecordEvent(start);

                for (int k = 0; k < repetition; k++)
                {
                    // write the current main vector into the mapped buffer
                    Helpers.InitBuffer(vecVals, vecIdx, vecLenght, mainIndex, mainVecIntPtr);

                    cuda.Launch(cuFunc, blocksPerGrid, 1);

                    // the kernel writes straight into mapped host memory; wait before reading it
                    cuda.SynchronizeContext();
                    Marshal.Copy(outputPtr2, output, 0, N);

                    // clear the entries written for this main vector before the next round
                    Helpers.SetBufferIdx(vecIdx, vecLenght, mainIndex, mainVecIntPtr, 0.0f);
                    mainIndex++;
                }

                cuda.RecordEvent(end);

                cuda.SynchronizeContext();

                timer.Stop();
                float naiveTime = cuda.ElapsedTime(start, end);

                Console.Write("csr vector Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed);

                int lenght = Math.Min(displayCount, N);
                Console.WriteLine();
                for (int i = 0; i < lenght; i++)
                {
                    Console.WriteLine("{0}-{1}", i, output[i]);
                }

                cuda.Free(valsPtr);
                cuda.Free(idxPtr);
                cuda.Free(vecLenghtPtr);

                // dOutput and mainVecPtr are only device-side aliases of the mapped host
                // allocations; the memory is released through the host pointers instead.
                CUDADriver.cuMemFreeHost(outputPtr2);
                CUDADriver.cuMemFreeHost(mainVecIntPtr);

                cuda.DestroyEvent(start);
                cuda.DestroyEvent(end);

                return output;
            }
            finally
            {
                // Dispose the wrapper on every path, including when a CUDA call throws.
                // NOTE(review): expected to tear down the context and anything still
                // allocated in it - confirm against the CUDA.NET version in use.
                cuda.Dispose();
            }
        }
Example #50
0
        /// <summary>
        /// Frees the device buffers, textures and module owned by this instance, then disposes
        /// the CUDA wrapper and clears the reference. Does nothing when no wrapper exists.
        /// </summary>
        private void DisposeCuda()
        {
            if (cuda == null)
            {
                return;
            }

            // valsCSR / valsCSC
            cuda.Free(valsCSRPtr);
            cuda.Free(valsCSCPtr);
            valsCSRPtr.Pointer = IntPtr.Zero;
            valsCSCPtr.Pointer = IntPtr.Zero;

            // idxCSR / idxCSC
            cuda.Free(idxCSRPtr);
            cuda.Free(idxCSCPtr);
            idxCSRPtr.Pointer = IntPtr.Zero;
            idxCSCPtr.Pointer = IntPtr.Zero;

            // vecLenghtCSR / vecLenghtCSC
            cuda.Free(vecLenghtCSRPtr);
            cuda.Free(vecLenghtCSCPtr);
            vecLenghtCSRPtr.Pointer = IntPtr.Zero;
            vecLenghtCSCPtr.Pointer = IntPtr.Zero;

            cuda.Free(qdPtr);
            qdPtr.Pointer = IntPtr.Zero;

            // diagPtr is only reset here, it is not freed
            diagPtr.Pointer = IntPtr.Zero;

            cuda.Free(alphaPtr);
            alphaPtr.Pointer = IntPtr.Zero;

            cuda.Free(gradPtr);
            gradPtr.Pointer = IntPtr.Zero;

            // each buffer below is released first, then its texture reference is destroyed
            cuda.Free(deltasPtr);
            deltasPtr.Pointer = IntPtr.Zero;
            cuda.DestroyTexture(cuDeltasTexRef);

            cuda.Free(labelsPtr);
            labelsPtr.Pointer = IntPtr.Zero;
            cuda.DestroyTexture(cuLabelsTexRef);

            cuda.Free(mainVecPtr);
            mainVecPtr.Pointer = IntPtr.Zero;
            cuda.DestroyTexture(cuMainVecTexRef);

            // finally the module and the wrapper itself
            cuda.UnloadModule(cuModule);
            cuda.Dispose();
            cuda = null;
        }
Example #51
0
        /// <summary>
        /// Creates the CUDA wrapper, loads the module named by <c>cudaModuleName</c> from the
        /// current directory and looks up every kernel function handle by its configured name.
        /// </summary>
        private void InitCudaModule()
        {
            cuda = new CUDA(0, true);

            string modulePath = Path.Combine(Environment.CurrentDirectory, cudaModuleName);
            cuModule = cuda.LoadModule(modulePath);

            // kernel handles, looked up in the loaded module
            cuFuncDotProd = cuda.GetModuleFunction(cudaProductKernelName);
            cuFuncGradFinalize = cuda.GetModuleFunction(cudaGradFinalizeName);
            cuFuncComputeBBstep = cuda.GetModuleFunction(cudaComputeBBStepName);

            cuFuncObjSquareW = cuda.GetModuleFunction(cudaObjWName);
            cuFuncObjSquareAlpha = cuda.GetModuleFunction(cudaObjAlphaName);

            cuFuncUpdateW = cuda.GetModuleFunction(cudaUpdateW);
            cuFuncUpdateAlpha = cuda.GetModuleFunction(cudaUpdateAlphaName);

            cuFuncMaxNorm = cuda.GetModuleFunction(cudaMaxNormName);
        }
Example #52
0
        /// <summary>
        /// Runs the "DotProd2" kernel from structKernel.cubin over N generated sparse vectors.
        /// Each vector's values and indices are uploaded as separate device buffers, and an
        /// array of <c>SparseVecPtr</c> structs holding those device addresses is uploaded
        /// alongside them.
        /// </summary>
        /// <returns>The N kernel results copied back from the device.</returns>
        private static unsafe float[] CuDotProdSparseVecStruct()
        {
            uint size = (uint)(N * sizeof(SparseVecPtr));

            CUDA cuda = new CUDA(0, true);
            try
            {
                // load module
                cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));
                CUfunction structPassFunc = cuda.GetModuleFunction("DotProd2");

                SparseVecPtr[] vectors = new SparseVecPtr[N];
                // keep every per-vector device buffer so it can be released afterwards
                CUdeviceptr[] valueBuffers = new CUdeviceptr[N];
                CUdeviceptr[] indexBuffers = new CUdeviceptr[N];

                Console.WriteLine("init and copy data");
                Stopwatch t = Stopwatch.StartNew();
                mainIndex = StartingIndex;
                for (int i = 0; i < N; i++)
                {
                    vectors[i] = new SparseVecPtr();

                    int vecSize = avgElements + i % stdElements;
                    vectors[i].size = vecSize;
                    float[] vals = Helpers.InitValues(i, vecSize, maxVal);

                    int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);

                    CUdeviceptr valsPtr = cuda.CopyHostToDevice(vals);
                    CUdeviceptr idxPtr = cuda.CopyHostToDevice(index);
                    valueBuffers[i] = valsPtr;
                    indexBuffers[i] = idxPtr;

                    vectors[i].indices = new IntPtr(idxPtr.Pointer);
                    vectors[i].values = (IntPtr)valsPtr.Pointer;
                }

                float[] output = new float[N];

                // Pin the struct array only for the upload, and always release the pin.
                CUdeviceptr dVectors;
                GCHandle handle = GCHandle.Alloc(vectors, GCHandleType.Pinned);
                try
                {
                    dVectors = cuda.CopyHostToDevice(handle.AddrOfPinnedObject(), size);
                }
                finally
                {
                    handle.Free();
                }

                CUdeviceptr dOutput = cuda.Allocate(output);

                Console.WriteLine("copy and init takes {0}", t.Elapsed);
                #region set cuda parameters
                cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1);

                int offset = 0;
                cuda.SetParameter(structPassFunc, offset, dVectors.Pointer);
                offset += IntPtr.Size;
                cuda.SetParameter(structPassFunc, offset, dOutput.Pointer);
                offset += IntPtr.Size;
                cuda.SetParameter(structPassFunc, offset, (uint)mainIndex);
                offset += sizeof(int);
                cuda.SetParameter(structPassFunc, offset, (uint)N);
                offset += sizeof(int);
                cuda.SetParameterSize(structPassFunc, (uint)offset);
                #endregion
                Console.WriteLine("start computation");

                CUevent start = cuda.CreateEvent();
                CUevent end = cuda.CreateEvent();

                Stopwatch timer = Stopwatch.StartNew();
                cuda.RecordEvent(start);

                cuda.Launch(structPassFunc, blocksPerGrid, 1);

                cuda.RecordEvent(end);

                cuda.SynchronizeContext();
                timer.Stop();
                float naiveTime = cuda.ElapsedTime(start, end);

                Console.Write("Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed);

                cuda.CopyDeviceToHost(dOutput, output);

                int lenght = Math.Min(displayCount, N);
                Console.WriteLine();
                for (int i = 0; i < lenght; i++)
                {
                    Console.WriteLine("{0}-{1}", i, output[i]);
                }

                // release everything that was allocated on the device
                for (int i = 0; i < N; i++)
                {
                    cuda.Free(valueBuffers[i]);
                    cuda.Free(indexBuffers[i]);
                }
                cuda.Free(dVectors);
                cuda.Free(dOutput);

                cuda.DestroyEvent(start);
                cuda.DestroyEvent(end);

                return output;
            }
            finally
            {
                // Dispose the wrapper on every path, including when a CUDA call throws.
                // NOTE(review): expected to tear down the context and anything still
                // allocated in it - confirm against the CUDA.NET version in use.
                cuda.Dispose();
            }
        }
Example #53
0
        /// <summary>
        /// Prints name, compute capability, clock rate, multiprocessor count and memory sizes
        /// for every CUDA device reported by the driver. Terminates the process with exit
        /// code -1 when no device is found.
        /// </summary>
        private static void DetailCudaDriver()
        {
            CUDA cuda = new CUDA(false);

            cuda.Init();

            int deviceCount = cuda.GetDeviceCount();

            if (deviceCount < 1)
            {
                Console.WriteLine("Cuda device not found");
                System.Environment.Exit(-1);
            }

            Console.WriteLine("Found {0} cuda devices", deviceCount);
            Device[] cuDevice = cuda.Devices;

            for (int i = 0; i < cuDevice.Length; i++)
            {
                Console.WriteLine("-------------------");
                Console.WriteLine("Cuda device nr {0} details:", i + 1);
                Console.WriteLine("Name: {0}", cuDevice[i].Name);
                Console.WriteLine("Compute: {0}", cuDevice[i].ComputeCapability);

                DeviceProperties prop = cuDevice[i].Properties;

                int processors = cuda.GetDeviceAttribute(CUDeviceAttribute.MultiProcessorCount, cuDevice[i].Handle);
                Console.WriteLine("Clock rate: {0}", prop.ClockRate);
                Console.WriteLine("Number of processors: {0}", processors);

                // The unit labels now match the divisors: /(1024*1024) gives MB, /1024 gives KB.
                // NOTE(review): assumes both sizes are reported in bytes, as in the CUDA
                // driver API - confirm against the CUDA.NET version in use.
                Console.WriteLine("Memory: {0} MB", (cuDevice[i].TotalMemory + 0.0) / (1024 * 1024));
                Console.WriteLine("Constant Memory: {0} KB", (prop.TotalConstantMemory + 0.0) / 1024);
            }
            Console.WriteLine("----------------------------------");
            Console.WriteLine();
        }
Example #54
0
        /// <summary>
        /// Transposes a 4096 x 4096 float matrix on the first CUDA device with the
        /// "transpose_naive" and "transpose" kernels, prints the average time per launch of
        /// each, and checks the device result against the host reference from computeGold.
        /// </summary>
        static void Main(string[] args)
        {
            const int size_x = 4096;
            const int size_y = 4096;

            // Init and select 1st device.
            CUDA cuda = new CUDA(0, true);

            // load module
            string modulePath = Path.Combine(Environment.CurrentDirectory, "transpose_kernel.ptx");
            cuda.LoadModule(modulePath);
            CUfunction transpose = cuda.GetModuleFunction("transpose");
            CUfunction transpose_naive = cuda.GetModuleFunction("transpose_naive");

            // host input: element k holds the value k
            float[] h_idata = new float[size_x * size_y];
            for (int k = 0; k < h_idata.Length; k++)
            {
                h_idata[k] = k;
            }

            // allocate device memory and copy the input to the device
            CUdeviceptr d_idata = cuda.CopyHostToDevice<float>(h_idata);
            CUdeviceptr d_odata = cuda.Allocate<float>(h_idata);

            // both kernels take the same parameters: (output, input, width, height)
            int widthOffset = IntPtr.Size * 2;
            int heightOffset = widthOffset + 4;
            uint parameterBytes = (uint)(widthOffset + 8);
            foreach (CUfunction kernel in new CUfunction[] { transpose_naive, transpose })
            {
                cuda.SetFunctionBlockShape(kernel, BLOCK_DIM, BLOCK_DIM, 1);
                cuda.SetParameter(kernel, 0, (uint)d_odata.Pointer);
                cuda.SetParameter(kernel, IntPtr.Size, (uint)d_idata.Pointer);
                cuda.SetParameter(kernel, widthOffset, (uint)size_x);
                cuda.SetParameter(kernel, heightOffset, (uint)size_y);
                cuda.SetParameterSize(kernel, parameterBytes);
            }

            int gridWidth = size_x / BLOCK_DIM;
            int gridHeight = size_y / BLOCK_DIM;

            // warmup so we don't time CUDA startup
            cuda.Launch(transpose_naive, gridWidth, gridHeight);
            cuda.Launch(transpose, gridWidth, gridHeight);

            int numIterations = 100;

            Console.WriteLine("Transposing a {0} by {1} matrix of floats...", size_x, size_y);
            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            // naive kernel
            cuda.RecordEvent(start);
            for (int run = 0; run < numIterations; run++)
            {
                cuda.Launch(transpose_naive, gridWidth, gridHeight);
            }
            cuda.SynchronizeContext();
            cuda.RecordEvent(end);
            cuda.SynchronizeContext();
            float naiveTime = cuda.ElapsedTime(start, end);

            Console.WriteLine("Naive transpose average time:     {0} ms\n", naiveTime / numIterations);

            // optimized kernel
            cuda.RecordEvent(start);
            for (int run = 0; run < numIterations; run++)
            {
                cuda.Launch(transpose, gridWidth, gridHeight);
            }
            cuda.SynchronizeContext();
            cuda.RecordEvent(end);
            cuda.SynchronizeContext();
            float optimizedTime = cuda.ElapsedTime(start, end);

            Console.WriteLine("Optimized transpose average time:     {0} ms\n", optimizedTime / numIterations);

            // fetch the device result and compare it with the host reference
            float[] h_odata = new float[size_x * size_y];
            cuda.CopyDeviceToHost<float>(d_odata, h_odata);

            float[] reference = new float[size_x * size_y];
            computeGold(reference, h_idata, size_x, size_y);

            bool passed = CompareF(reference, h_odata, size_x * size_y);
            string verdict = passed ? "PASSED" : "FAILED";

            Console.WriteLine("Test {0}", verdict);

            cuda.Free(d_idata);
            cuda.Free(d_odata);

            Console.ReadKey();
        }
		/// <summary>
		/// Allocates the device and host buffers of one encoding task, looks up the kernel
		/// functions in the module already loaded into <paramref name="_cuda"/>, and creates
		/// the task's stream, sample buffer, output buffer and frame.
		/// </summary>
		/// <param name="_cuda">CUDA wrapper used for all allocations and lookups.</param>
		/// <param name="channelCount">Channel count used to size the buffers and the frame.</param>
		/// <param name="channels">Channel count passed to the verifier's PCM configuration.</param>
		/// <param name="bits_per_sample">Bit depth passed to the verifier's PCM configuration.</param>
		/// <param name="max_frame_size">Per-frame byte budget used to size the output buffer.</param>
		/// <param name="do_verify">When true, a verifying reader is created with CRC checking off.</param>
		/// <exception cref="CUDAException">A host allocation failed; host buffers allocated so far are released first.</exception>
		unsafe public FlaCudaTask(CUDA _cuda, int channelCount, int channels, uint bits_per_sample, int max_frame_size, bool do_verify)
		{
			cuda = _cuda;

			// buffer sizes in bytes
			residualTasksLen = sizeof(FlaCudaSubframeTask) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FlaCudaWriter.maxFrames;
			bestResidualTasksLen = sizeof(FlaCudaSubframeTask) * channelCount * FlaCudaWriter.maxFrames;
			samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount;
			int partitionsLen = sizeof(int) * (30 << 8) * channelCount * FlaCudaWriter.maxFrames;
			int riceParamsLen = sizeof(int) * (4 << 8) * channelCount * FlaCudaWriter.maxFrames;
			int lpcDataLen = sizeof(float) * 32 * 33 * lpc.MAX_LPC_WINDOWS * channelCount * FlaCudaWriter.maxFrames;
			uint halfSamplesLen = (uint)samplesBufferLen / 2;
			uint quarterRiceParamsLen = (uint)riceParamsLen / 4;

			// device buffers
			cudaSamplesBytes = cuda.Allocate(halfSamplesLen);
			cudaSamples = cuda.Allocate((uint)samplesBufferLen);
			cudaResidual = cuda.Allocate((uint)samplesBufferLen);
			cudaLPCData = cuda.Allocate((uint)lpcDataLen);
			cudaPartitions = cuda.Allocate((uint)partitionsLen);
			cudaRiceParams = cuda.Allocate((uint)riceParamsLen);
			cudaBestRiceParams = cuda.Allocate(quarterRiceParamsLen);
			cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FlaCudaWriter.maxAutocorParts + FlaCudaWriter.maxFrames)));
			cudaResidualTasks = cuda.Allocate((uint)residualTasksLen);
			cudaBestResidualTasks = cuda.Allocate((uint)bestResidualTasksLen);
			cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * 64 /*FlaCudaWriter.maxResidualParts*/ * FlaCudaWriter.maxFrames));

			// host buffers: stop at the first allocation that fails
			CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBytesPtr, halfSamplesLen);
			if (cuErr == CUResult.Success)
			{
				cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen);
			}
			if (cuErr == CUResult.Success)
			{
				cuErr = CUDADriver.cuMemAllocHost(ref bestRiceParamsPtr, quarterRiceParamsLen);
			}
			if (cuErr == CUResult.Success)
			{
				cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)residualTasksLen);
			}
			if (cuErr == CUResult.Success)
			{
				cuErr = CUDADriver.cuMemAllocHost(ref bestResidualTasksPtr, (uint)bestResidualTasksLen);
			}

			if (cuErr != CUResult.Success)
			{
				// release whichever host buffers were obtained, clear every pointer, then report
				if (samplesBytesPtr != IntPtr.Zero)
				{
					CUDADriver.cuMemFreeHost(samplesBytesPtr);
				}
				samplesBytesPtr = IntPtr.Zero;

				if (residualBufferPtr != IntPtr.Zero)
				{
					CUDADriver.cuMemFreeHost(residualBufferPtr);
				}
				residualBufferPtr = IntPtr.Zero;

				if (bestRiceParamsPtr != IntPtr.Zero)
				{
					CUDADriver.cuMemFreeHost(bestRiceParamsPtr);
				}
				bestRiceParamsPtr = IntPtr.Zero;

				if (residualTasksPtr != IntPtr.Zero)
				{
					CUDADriver.cuMemFreeHost(residualTasksPtr);
				}
				residualTasksPtr = IntPtr.Zero;

				if (bestResidualTasksPtr != IntPtr.Zero)
				{
					CUDADriver.cuMemFreeHost(bestResidualTasksPtr);
				}
				bestResidualTasksPtr = IntPtr.Zero;

				throw new CUDAException(cuErr);
			}

			// kernel handles
			cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
			cudaStereoDecorr = cuda.GetModuleFunction("cudaStereoDecorr");
			cudaChannelDecorr = cuda.GetModuleFunction("cudaChannelDecorr");
			cudaChannelDecorr2 = cuda.GetModuleFunction("cudaChannelDecorr2");
			cudaFindWastedBits = cuda.GetModuleFunction("cudaFindWastedBits");
			cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
			cudaQuantizeLPC = cuda.GetModuleFunction("cudaQuantizeLPC");
			cudaComputeLPCLattice = cuda.GetModuleFunction("cudaComputeLPCLattice");
			cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
			cudaEstimateResidual8 = cuda.GetModuleFunction("cudaEstimateResidual8");
			cudaEstimateResidual12 = cuda.GetModuleFunction("cudaEstimateResidual12");
			cudaEstimateResidual1 = cuda.GetModuleFunction("cudaEstimateResidual1");
			cudaChooseBestMethod = cuda.GetModuleFunction("cudaChooseBestMethod");
			cudaCopyBestMethod = cuda.GetModuleFunction("cudaCopyBestMethod");
			cudaCopyBestMethodStereo = cuda.GetModuleFunction("cudaCopyBestMethodStereo");
			cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
			cudaCalcPartition = cuda.GetModuleFunction("cudaCalcPartition");
			cudaCalcPartition16 = cuda.GetModuleFunction("cudaCalcPartition16");
			cudaCalcLargePartition = cuda.GetModuleFunction("cudaCalcLargePartition");
			cudaSumPartition = cuda.GetModuleFunction("cudaSumPartition");
			cudaFindRiceParameter = cuda.GetModuleFunction("cudaFindRiceParameter");
			cudaFindPartitionOrder = cuda.GetModuleFunction("cudaFindPartitionOrder");

			// managed-side state
			stream = cuda.CreateStream();
			samplesBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channelCount];
			outputBuffer = new byte[max_frame_size * FlaCudaWriter.maxFrames + 1];
			frame = new FlacFrame(channelCount);
			frame.writer = new BitWriter(outputBuffer, 0, outputBuffer.Length);

			if (!do_verify)
			{
				return;
			}

			verify = new FlakeReader(new AudioPCMConfig((int)bits_per_sample, channels, 44100));
			verify.DoCRC = false;
		}
		/// <summary>
		/// One-time setup: creates the CUDA context, loads the embedded "flacuda.cubin"
		/// module, opens the output stream if none was supplied, writes the stream header,
		/// and allocates the encoding tasks and the window buffer. Later calls do nothing
		/// once <c>inited</c> is set.
		/// </summary>
		public unsafe void InitTasks()
		{
			bool midSide = channels == 2 && eparams.do_midside;
			int channelCount = channels;
			if (midSide)
			{
				channelCount = 2 * channels;
			}

			if (inited)
			{
				return;
			}

			cuda = new CUDA(true, InitializationFlags.None);
			cuda.CreateContext(0, CUCtxFlags.SchedAuto);

			// the module is read as text from the assembly's embedded resource
			using (Stream cubin = GetType().Assembly.GetManifestResourceStream(GetType(), "flacuda.cubin"))
			using (StreamReader reader = new StreamReader(cubin))
			{
				string cubinText = reader.ReadToEnd();
				byte[] cubinBytes = new ASCIIEncoding().GetBytes(cubinText);
				cuda.LoadModule(cubinBytes);
			}

			if (_IO == null)
			{
				_IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read);
			}

			int header_size = flake_encode_init();
			_IO.Write(header, 0, header_size);
			if (_IO.CanSeek)
			{
				// remember where the first frame starts
				first_frame_offset = _IO.Position;
			}

			task1 = new FlaCudaTask(cuda, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
			task2 = new FlaCudaTask(cuda, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);

			int cpuThreadCount = _settings.CPUThreads;
			if (cpuThreadCount > 0)
			{
				cpu_tasks = new FlaCudaTask[cpuThreadCount];
				for (int slot = 0; slot < cpu_tasks.Length; slot++)
				{
					cpu_tasks[slot] = new FlaCudaTask(cuda, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
				}
			}

			cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);

			inited = true;
		}