Пример #1
0
        /// <summary>
        /// Prepares this kernel for evaluation on the GPU. After the base initialization the
        /// problem elements are converted with <c>CudaHelpers.TransformToERTILPFormat</c>, the
        /// converted arrays and the per-element value sums are copied to the device, the output
        /// buffer is allocated, and the dense main vector and the labels are bound to textures.
        /// When <c>MakeDenseVectorOnGPU</c> is set, an <c>EllpackDenseVectorBuilder</c> is
        /// created and initialized as well.
        /// </summary>
        public override void Init()
        {
            base.Init();

            float[] vecVals;
            int[] vecColIdx;
            int[] vecLength;

            // Many threads are launched per row, so the grid is sized from
            // ThreadsPerRow * element count instead of the element count alone.
            blocksPerGrid = (int)Math.Ceiling((ThreadsPerRow * problemElements.Length + 0.0) / threadsPerBlock);

            int align = ThreadsPerRow * Prefetch;

            CudaHelpers.TransformToERTILPFormat(out vecVals, out vecColIdx, out vecLength, problemElements, align, ThreadsPerRow);

            // PLINQ does not promise source order unless AsOrdered is requested; without it
            // selfSum[i] is not guaranteed to be the sum computed for problemElements[i].
            selfSum = problemElements.AsParallel().AsOrdered().Select(x => x.Values.Sum()).ToArray();

            #region cuda initialization

            InitCudaModule();

            // Copy the converted data to the device.
            valsPtr = cuda.CopyHostToDevice(vecVals);
            idxPtr = cuda.CopyHostToDevice(vecColIdx);
            vecLengthPtr = cuda.CopyHostToDevice(vecLength);

            selfSumPtr = cuda.CopyHostToDevice(selfSum);

            // One float of output per problem element, in host memory allocated with the
            // CU_MEMHOSTALLOC_DEVICEMAP flag; outputPtr is the device pointer for that memory.
            uint memSize = (uint)(problemElements.Length * sizeof(float));

            outputIntPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0);

            #endregion

            SetCudaFunctionParameters();

            // The main vector is stored densely (Dim + 1 entries), so most entries stay zero;
            // per the original author's note this makes the CUDA computation faster.
            mainVector = new float[problemElements[0].Dim + 1];
            CudaHelpers.FillDenseVector(problemElements[0], mainVector);

            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);

            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr);

            if (MakeDenseVectorOnGPU)
            {
                vecBuilder = new EllpackDenseVectorBuilder(cuda, mainVecPtr, valsPtr, idxPtr, vecLengthPtr, problemElements.Length, problemElements[0].Dim);
                vecBuilder.Init();
            }
        }
Пример #2
0
        /// <summary>
        /// Prepares this kernel for evaluation on the GPU. After the base initialization the
        /// launch geometry and alignment are computed, the problem elements are converted with
        /// <c>CudaHelpers.TransformToSERTILP</c>, the converted arrays, the labels and the
        /// per-element value sums are copied to the device, the output buffer is allocated and
        /// the dense main vector is bound to its texture.
        /// </summary>
        public override void Init()
        {
            base.Init();

            blockSize = threadsPerRow * sliceSize;
            int N = problemElements.Length;
            blocksPerGrid = (int)Math.Ceiling(1.0 * N * threadsPerRow / blockSize);

            // sliceSize * threadsPerRow rounded up to the next multiple of 64.
            align = (int)Math.Ceiling(1.0 * sliceSize * threadsPerRow / 64) * 64;

            float[] vecVals;
            int[] vecColIdx;
            int[] vecLength;
            int[] sliceStart;

            CudaHelpers.TransformToSERTILP(out vecVals, out vecColIdx, out sliceStart, out vecLength, problemElements, threadsPerRow, sliceSize, preFechSize);

            // PLINQ does not promise source order unless AsOrdered is requested; without it
            // selfSum[i] is not guaranteed to be the sum computed for problemElements[i].
            selfSum = problemElements.AsParallel().AsOrdered().Select(x => x.Values.Sum()).ToArray();

            #region cuda initialization

            InitCudaModule();

            // Copy the converted data to the device.
            valsPtr = cuda.CopyHostToDevice(vecVals);
            idxPtr = cuda.CopyHostToDevice(vecColIdx);
            vecLengthPtr = cuda.CopyHostToDevice(vecLength);
            sliceStartPtr = cuda.CopyHostToDevice(sliceStart);

            // Labels are copied as a plain device array here; no label texture is bound.
            labelsPtr = cuda.CopyHostToDevice(Y);

            selfSumPtr = cuda.CopyHostToDevice(selfSum);

            // One float of output per problem element, in host memory allocated with the
            // CU_MEMHOSTALLOC_DEVICEMAP flag; outputPtr is the device pointer for that memory.
            uint memSize = (uint)(problemElements.Length * sizeof(float));

            outputIntPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0);

            #endregion

            SetCudaFunctionParameters();

            // The main vector is stored densely (Dim + 1 entries), so most entries stay zero;
            // per the original author's note this makes the CUDA computation faster.
            mainVector = new float[problemElements[0].Dim + 1];
            CudaHelpers.FillDenseVector(problemElements[0], mainVector);

            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);
        }
Пример #3
0
        /// <summary>
        /// Sets up the CUDA side of this kernel: converts the problem elements with
        /// <c>CudaHelpers.TransformToCSRFormat</c>, copies the converted arrays to the device,
        /// allocates the output buffer and binds the dense main vector and the labels to
        /// their textures.
        /// </summary>
        public override void Init()
        {
            // No separate linear kernel is initialized here; per the original author's note
            // it would only be needed for computing the element product.
            base.Init();

            float[] values;
            int[] indices;
            int[] lengths;
            CudaHelpers.TransformToCSRFormat(out values, out indices, out lengths, problemElements);

            #region cuda initialization

            InitCudaModule();

            // Upload the converted arrays.
            valsPtr = cuda.CopyHostToDevice(values);
            idxPtr = cuda.CopyHostToDevice(indices);
            vecLengthPtr = cuda.CopyHostToDevice(lengths);

            // Results live in host memory allocated with the CU_MEMHOSTALLOC_DEVICEMAP flag,
            // one float per problem element; outputPtr is the matching device pointer.
            uint outputBytes = (uint)(problemElements.Length * sizeof(float));
            outputIntPtr = cuda.HostAllocate(outputBytes, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0);

            #endregion

            SetCudaFunctionParameters();

            // The main vector is kept dense (Dim + 1 entries), so most entries are zero;
            // the original author notes the CUDA computation is faster this way.
            var firstElement = problemElements[0];
            mainVector = new float[firstElement.Dim + 1];
            CudaHelpers.FillDenseVector(firstElement, mainVector);

            // Bind the main vector and the labels to their texture references.
            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);
            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr);
        }
Пример #4
0
        /// <summary>
        /// Rebuilds the dense main vector from the problem element at
        /// <paramref name="mainIndex"/> and copies it to the device, unless that index equals
        /// <c>mainVectorIdx</c>. This method does not update <c>mainVectorIdx</c> itself.
        /// </summary>
        /// <param name="mainIndex">Index into <c>problemElements</c> of the element to use.</param>
        public virtual void SetMemoryForDenseVector(int mainIndex)
        {
            // Looked up unconditionally, so an invalid index fails even when no upload is needed.
            SparseVec selected = problemElements[mainIndex];

            if (mainVectorIdx == mainIndex)
            {
                return;
            }

            CudaHelpers.FillDenseVector(selected, mainVector);
            cuda.CopyHostToDevice(mainVecPtr, mainVector);
        }
Пример #5
0
        /// <summary>
        /// Sets up the CUDA side of this kernel. The inner <c>linKernel</c> is initialized
        /// first and its <c>DiagonalDotCache</c> is reused as <c>selfLinDot</c>; the problem
        /// elements are converted with <c>CudaHelpers.TransformToCSRFormat</c>, all arrays are
        /// copied to the device, the output buffer is allocated and the dense main vector and
        /// the labels are bound to their textures.
        /// </summary>
        public override void Init()
        {
            // The linear kernel works on the same elements and labels.
            linKernel.ProblemElements = problemElements;
            linKernel.Y = Y;
            linKernel.Init();

            base.Init();

            float[] values;
            int[] indices;
            int[] lengths;
            CudaHelpers.TransformToCSRFormat(out values, out indices, out lengths, problemElements);

            selfLinDot = linKernel.DiagonalDotCache;

            #region cuda initialization

            InitCudaModule();

            // Upload the converted arrays and the cached diagonal dot products.
            valsPtr = cuda.CopyHostToDevice(values);
            idxPtr = cuda.CopyHostToDevice(indices);
            vecLengthPtr = cuda.CopyHostToDevice(lengths);
            selfLinDotPtr = cuda.CopyHostToDevice(selfLinDot);

            // Results live in host memory allocated with the CU_MEMHOSTALLOC_DEVICEMAP flag,
            // one float per problem element; outputPtr is the matching device pointer.
            uint outputBytes = (uint)(problemElements.Length * sizeof(float));
            outputIntPtr = cuda.HostAllocate(outputBytes, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0);

            #endregion

            SetCudaFunctionParameters();

            // The main vector is kept dense (Dim + 1 entries), so most entries are zero;
            // the original author notes the CUDA computation is faster this way.
            var firstElement = problemElements[0];
            mainVector = new float[firstElement.Dim + 1];
            CudaHelpers.FillDenseVector(firstElement, mainVector);

            // Bind the main vector and the labels to their texture references.
            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);
            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr);
        }
Пример #6
0
        /// <summary>
        /// Prepares this kernel for evaluation on the GPU. After the base initialization the
        /// problem elements are converted with <c>CudaHelpers.TransformToEllpackRFormat</c>,
        /// the converted arrays and the per-element value sums are copied to the device, the
        /// output buffer is allocated, and the dense main vector and the labels are bound to
        /// textures. When <c>MakeDenseVectorOnGPU</c> is set, an
        /// <c>EllpackDenseVectorBuilder</c> is created and initialized as well.
        /// </summary>
        public override void Init()
        {
            base.Init();

            float[] vecVals;
            int[] vecColIdx;
            int[] vecLength;

            CudaHelpers.TransformToEllpackRFormat(out vecVals, out vecColIdx, out vecLength, problemElements);

            // PLINQ does not promise source order unless AsOrdered is requested; without it
            // selfSum[i] is not guaranteed to be the sum computed for problemElements[i].
            selfSum = problemElements.AsParallel().AsOrdered().Select(x => x.Values.Sum()).ToArray();

            #region cuda initialization

            InitCudaModule();

            // Copy the converted data to the device.
            valsPtr = cuda.CopyHostToDevice(vecVals);
            idxPtr = cuda.CopyHostToDevice(vecColIdx);
            vecLengthPtr = cuda.CopyHostToDevice(vecLength);

            selfSumPtr = cuda.CopyHostToDevice(selfSum);

            // One float of output per problem element, in host memory allocated with the
            // CU_MEMHOSTALLOC_DEVICEMAP flag; outputPtr is the device pointer for that memory.
            uint memSize = (uint)(problemElements.Length * sizeof(float));

            outputIntPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0);

            #endregion

            SetCudaFunctionParameters();

            // The main vector is stored densely (Dim + 1 entries), so most entries stay zero;
            // per the original author's note this makes the CUDA computation faster.
            mainVector = new float[problemElements[0].Dim + 1];
            CudaHelpers.FillDenseVector(problemElements[0], mainVector);

            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);

            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr);

            if (MakeDenseVectorOnGPU)
            {
                vecBuilder = new EllpackDenseVectorBuilder(cuda, mainVecPtr, valsPtr, idxPtr, vecLengthPtr, problemElements.Length, problemElements[0].Dim);
                vecBuilder.Init();
            }
        }
        /// <summary>
        /// Launches the GPU function named by <c>cudaFunctionName</c> for the problem element
        /// at <paramref name="element1"/> and copies the output buffer into
        /// <paramref name="results"/>. The dense main vector is rebuilt and uploaded only when
        /// <paramref name="element1"/> differs from the cached <c>mainVecIdx</c>.
        /// </summary>
        /// <param name="element1">Index into <c>problemElements</c> of the main element.</param>
        /// <param name="results">
        /// Destination array. At most <c>problemElements.Length</c> values are written; any
        /// entries beyond that are left untouched.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="results"/> is null.</exception>
        public override void AllProducts(int element1, float[] results)
        {
            // Reject a missing destination before any GPU work is started.
            if (results == null)
            {
                throw new ArgumentNullException("results");
            }

            var mainVec = problemElements[element1];

            if (mainVecIdx != element1)
            {
                CudaHelpers.FillDenseVector(mainVec, mainVector);

                cuGPU.CopyHostToDevice(mainVectorPtr, mainVector);
            }

            mainVecIdx = element1;

            gpu.Launch(blockPerGrid, blockSize, cudaFunctionName, valsPtr, idxPtr, vecLenghtPtr, sliceStartPtr, selfLinDotPtr, labelsPtr, outputPtr, mainVecIdx, problemElements.Length, Gamma, align);

            gpu.Synchronize();

            // Init allocates the output buffer for problemElements.Length floats, so never
            // copy more than that out of unmanaged memory, even if results is longer.
            int count = Math.Min(results.Length, problemElements.Length);
            Marshal.Copy(outputIntPtr, results, 0, count);
        }
Пример #8
0
        /// <summary>
        /// Sets up the CUDA side of this kernel. The inner <c>linKernel</c> is initialized
        /// first and its <c>DiagonalDotCache</c> is reused as <c>selfLinDot</c>; the problem
        /// elements are converted with <c>CudaHelpers.TransformToEllpackRFormat</c>, all arrays
        /// are copied to the device, the output buffer is allocated and the dense main vector
        /// and the labels are bound to their textures. When <c>MakeDenseVectorOnGPU</c> is set,
        /// an <c>EllpackDenseVectorBuilder</c> is created and initialized as well.
        /// </summary>
        public override void Init()
        {
            // The linear kernel works on the same elements and labels.
            linKernel.ProblemElements = problemElements;
            linKernel.Y = Y;
            linKernel.Init();

            base.Init();

            float[] values;
            int[] columnIndices;
            int[] lengths;
            CudaHelpers.TransformToEllpackRFormat(out values, out columnIndices, out lengths, problemElements);

            selfLinDot = linKernel.DiagonalDotCache;

            #region cuda initialization

            InitCudaModule();

            // Upload the converted arrays and the cached diagonal dot products.
            valsPtr = cuda.CopyHostToDevice(values);
            idxPtr = cuda.CopyHostToDevice(columnIndices);
            vecLengthPtr = cuda.CopyHostToDevice(lengths);
            selfLinDotPtr = cuda.CopyHostToDevice(selfLinDot);

            // Results live in host memory allocated with the CU_MEMHOSTALLOC_DEVICEMAP flag,
            // one float per problem element; outputPtr is the matching device pointer.
            uint outputBytes = (uint)(problemElements.Length * sizeof(float));
            outputIntPtr = cuda.HostAllocate(outputBytes, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0);

            #endregion

            SetCudaFunctionParameters();

            // The main vector is kept dense (Dim + 1 entries), so most entries are zero;
            // the original author notes the CUDA computation is faster this way.
            var firstElement = problemElements[0];
            mainVector = new float[firstElement.Dim + 1];
            CudaHelpers.FillDenseVector(firstElement, mainVector);

            // Bind the main vector and the labels to their texture references.
            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);
            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr);

            if (!MakeDenseVectorOnGPU)
            {
                return;
            }

            vecBuilder = new EllpackDenseVectorBuilder(cuda, mainVecPtr, valsPtr, idxPtr, vecLengthPtr, problemElements.Length, problemElements[0].Dim);
            vecBuilder.Init();
        }
        /// <summary>
        /// Sets up the GPU side of this kernel. The inner <c>linKernel</c> is initialized first
        /// and its <c>DiagonalDotCache</c> is reused as <c>selfLinDot</c>; the launch geometry
        /// and alignment are computed, the problem elements are converted with
        /// <c>CudaHelpers.TransformToSlicedEllpack</c>, all arrays and the labels are copied to
        /// the device through <c>gpu</c>, the output buffer is allocated through <c>cuGPU</c>
        /// and the dense main vector is bound to its texture.
        /// </summary>
        public override void Init()
        {
            // The linear kernel works on the same elements and labels.
            linKernel.ProblemElements = problemElements;
            linKernel.Y = Y;
            linKernel.Init();

            base.Init();

            // Launch geometry: a block spans threadsPerRow * sliceSize threads.
            blockSize = threadsPerRow * sliceSize;
            int elementCount = problemElements.Length;
            blockPerGrid = (int)Math.Ceiling(1.0 * elementCount * threadsPerRow / blockSize);

            // sliceSize * threadsPerRow rounded up to the next multiple of 64.
            int alignUnits = (int)Math.Ceiling(1.0 * sliceSize * threadsPerRow / 64);
            align = alignUnits * 64;

            float[] values;
            int[] columnIndices;
            int[] lengths;
            int[] sliceOffsets;
            CudaHelpers.TransformToSlicedEllpack(out values, out columnIndices, out sliceOffsets, out lengths, problemElements, threadsPerRow, sliceSize);

            selfLinDot = linKernel.DiagonalDotCache;

            #region cudafy initialization

            InitCudaModule();

            // Upload the converted arrays, the cached diagonal dot products and the labels.
            valsPtr = gpu.CopyToDevice(values);
            idxPtr = gpu.CopyToDevice(columnIndices);
            vecLenghtPtr = gpu.CopyToDevice(lengths);
            sliceStartPtr = gpu.CopyToDevice(sliceOffsets);
            selfLinDotPtr = gpu.CopyToDevice(selfLinDot);
            labelsPtr = gpu.CopyToDevice(Y);

            // Results live in host memory allocated with the CU_MEMHOSTALLOC_DEVICEMAP flag,
            // one float per problem element; outputPtr is the matching device pointer.
            uint outputBytes = (uint)(problemElements.Length * sizeof(float));
            outputIntPtr = cuGPU.HostAllocate(outputBytes, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            outputPtr = cuGPU.GetHostDevicePointer(outputIntPtr, 0);

            #endregion

            // The main vector is kept dense (Dim + 1 entries), so most entries are zero;
            // the original author notes the CUDA computation is faster this way.
            var firstElement = problemElements[0];
            mainVector = new float[firstElement.Dim + 1];
            CudaHelpers.FillDenseVector(firstElement, mainVector);

            // Only the main vector is bound to a texture; the labels stay a plain device array.
            CudaHelpers.SetTextureMemory(cuGPU, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVectorPtr);
        }