示例#1
0
        /// <summary>
        /// Prepares this kernel for GPU evaluation: derives the launch geometry,
        /// converts the problem elements with
        /// <c>CudaHelpers.TransformToSlicedEllpack</c>, copies the resulting
        /// arrays, the labels and the per-element value sums to the device,
        /// allocates the output buffer and binds the dense "main vector" texture.
        /// </summary>
        public override void Init()
        {
            base.Init();

            // One block handles sliceSize rows with threadsPerRow threads per row.
            blockSize = threadsPerRow * sliceSize;
            int N = problemElements.Length;

            // N * threadsPerRow threads in total, split into blocks of blockSize (rounded up).
            blocksPerGrid = (int)Math.Ceiling(1.0 * N * threadsPerRow / blockSize);

            // sliceSize * threadsPerRow rounded up to the next multiple of 64.
            align = (int)Math.Ceiling(1.0 * sliceSize * threadsPerRow / 64) * 64;

            float[] vecVals;
            int[]   vecColIdx;
            int[]   vecLength;
            int[]   sliceStart;

            CudaHelpers.TransformToSlicedEllpack(out vecVals, out vecColIdx, out sliceStart, out vecLength, problemElements, threadsPerRow, sliceSize);

            // selfSum[i] must belong to problemElements[i]. PLINQ does not guarantee
            // source order for unordered queries, so AsOrdered() is required here to
            // keep the result aligned with the element indices.
            selfSum = problemElements.AsParallel().AsOrdered().Select(x => x.Values.Sum()).ToArray();

            #region cuda initialization

            InitCudaModule();

            // copy data to device, set cuda function parameters
            valsPtr       = cuda.CopyHostToDevice(vecVals);
            idxPtr        = cuda.CopyHostToDevice(vecColIdx);
            vecLengthPtr  = cuda.CopyHostToDevice(vecLength);
            sliceStartPtr = cuda.CopyHostToDevice(sliceStart);

            labelsPtr = cuda.CopyHostToDevice(Y);
            // per-element sums, indexed like problemElements (see the ordering note above)
            selfSumPtr = cuda.CopyHostToDevice(selfSum);

            // one float of output per problem element
            uint memSize = (uint)(problemElements.Length * sizeof(float));

            // Host allocation requested with the DEVICEMAP flag; the matching device
            // pointer is then obtained for the same allocation.
            outputIntPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            outputPtr    = cuda.GetHostDevicePointer(outputIntPtr, 0);

            // alternative kept for reference: plain device memory allocation
            //outputPtr = cuda.Allocate((uint)(sizeof(float) * problemElements.Length));


            #endregion

            SetCudaFunctionParameters();

            // Allocate the main vector as a dense array sized by the problem dimension;
            // many entries will be zero, but the CUDA computation is faster that way.
            mainVector = new float[problemElements[0].Dim + 1];
            CudaHelpers.FillDenseVector(problemElements[0], mainVector);

            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);

            // binding the labels as a texture is left disabled:
            // CudaHelpers.SetTextureMemory(cuda,cuModule,ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr);
        }
        /// <summary>
        /// Initializes the inner linear kernel and then this kernel's GPU state:
        /// launch geometry, the arrays produced by
        /// <c>CudaHelpers.TransformToSlicedEllpack</c>, the cached diagonal dot
        /// products, the labels, the output buffer and the dense main-vector texture.
        /// </summary>
        public override void Init()
        {
            // The linear kernel works on the same data; it is initialized first so
            // that its DiagonalDotCache can be read further down.
            linKernel.ProblemElements = problemElements;
            linKernel.Y = Y;
            linKernel.Init();

            base.Init();

            // ---- launch geometry ----
            int elementCount = problemElements.Length;
            blockSize = threadsPerRow * sliceSize;

            double blocksNeeded = Math.Ceiling((double)elementCount * threadsPerRow / blockSize);
            blockPerGrid = (int)blocksNeeded;

            // sliceSize * threadsPerRow rounded up to the next multiple of 64
            double chunksOf64 = Math.Ceiling((double)sliceSize * threadsPerRow / 64);
            align = (int)chunksOf64 * 64;

            // ---- host-side sparse layout ----
            float[] ellValues;
            int[]   ellColumns;
            int[]   rowLengths;
            int[]   sliceOffsets;

            CudaHelpers.TransformToSlicedEllpack(
                out ellValues,
                out ellColumns,
                out sliceOffsets,
                out rowLengths,
                problemElements,
                threadsPerRow,
                sliceSize);

            selfLinDot = linKernel.DiagonalDotCache;

            #region cudafy initialization

            InitCudaModule();

            // upload the layout arrays, the diagonal cache and the labels
            valsPtr       = gpu.CopyToDevice(ellValues);
            idxPtr        = gpu.CopyToDevice(ellColumns);
            vecLenghtPtr  = gpu.CopyToDevice(rowLengths);
            sliceStartPtr = gpu.CopyToDevice(sliceOffsets);
            selfLinDotPtr = gpu.CopyToDevice(selfLinDot);
            labelsPtr     = gpu.CopyToDevice(Y);

            // Note: Gamma is not uploaded to constant memory here, and the output
            // buffer is not allocated through the gpu wrapper; both attempts were
            // left disabled in favor of the cuGPU calls below.

            // one float of output per problem element
            int outputBytes = problemElements.Length * sizeof(float);

            // host allocation requested with the DEVICEMAP flag, followed by a
            // lookup of the device pointer for that same allocation
            outputIntPtr = cuGPU.HostAllocate((uint)outputBytes, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            outputPtr    = cuGPU.GetHostDevicePointer(outputIntPtr, 0);

            #endregion

            // The main vector is stored densely, sized by the problem dimension:
            // many entries stay zero, but the CUDA computation is faster that way.
            int denseLength = problemElements[0].Dim + 1;
            mainVector = new float[denseLength];
            CudaHelpers.FillDenseVector(problemElements[0], mainVector);

            CudaHelpers.SetTextureMemory(cuGPU, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVectorPtr);

            // binding the labels as a texture is left disabled:
            //CudaHelpers.SetTextureMemory(cuGPU, ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr);
        }