Example #1
 public static void Copy(FloatResidentArray res, FloatResidentArray src, int N)
 {
     Parallel.For(0, N, (i) =>
     {
         res[i] = src[i];
     });
 }
Example #2
 public static void Saxpy(FloatResidentArray res, FloatResidentArray x, float alpha, FloatResidentArray y, int N)
 {
     Parallel.For(0, N, (i) =>
     {
         res[i] = x[i] + alpha * y[i];
     });
 }
Example #3
        static void Main(string[] args)
        {
            // configure CUDA
            cudaDeviceProp prop;

            cuda.GetDeviceProperties(out prop, 0);
            const int BLOCK_DIM = 256;

            runner  = HybRunner.Cuda().SetDistrib(16 * prop.multiProcessorCount, 1, BLOCK_DIM, 1, 1, BLOCK_DIM * sizeof(float));
            wrapper = runner.Wrap(new Program());

            int                size = 1000000; // very slow convergence with no preconditioner
            SparseMatrix       A    = SparseMatrix.Laplacian_1D(size);
            FloatResidentArray B    = new FloatResidentArray(size);
            FloatResidentArray X    = new FloatResidentArray(size);

            int   maxiter = 1000;
            float eps     = 1.0e-09f;

            for (int i = 0; i < size; ++i)
            {
                B[i] = 1.0f; // right side
                X[i] = 0.0f; // starting point
            }

            ConjugateGradient(X, A, B, maxiter, eps);
        }
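Main assigns runner and wrapper without declaring them; in this sample they would be static members of Program, shared with ConjugateGradient below. A minimal sketch of the assumed declarations (the dynamic type of the wrapper is an assumption, not shown in the listings):

        // Assumed static fields of Program, used by Main and ConjugateGradient:
        static HybRunner runner;
        static dynamic   wrapper;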
Example #4
        public static void ConjugateGradient(FloatResidentArray X, SparseMatrix A, FloatResidentArray B, int maxiter, float eps)
        {
            int N = (int)B.Count;
            FloatResidentArray R  = new FloatResidentArray(N);
            FloatResidentArray P  = new FloatResidentArray(N);
            FloatResidentArray AP = new FloatResidentArray(N);

            A.RefreshDevice();
            X.RefreshDevice();
            B.RefreshDevice();

            wrapper.Fmsub(R, B, A, X, N);                       // R = B - A*X
            wrapper.Copy(P, R, N);
            int k = 0;

            while (k < maxiter)
            {
                wrapper.Multiply(AP, A, P, N);                  // AP = A*P
                float r     = ScalarProd(R, R, N);              // save <R|R>
                float alpha = r / ScalarProd(P, AP, N);         // alpha = <R|R> / <P|AP>
                wrapper.Saxpy(X, X, alpha, P, N);               // X = X + alpha*P
                wrapper.Saxpy(R, R, -alpha, AP, N);             // R = R - alpha*AP
                float rr = ScalarProd(R, R, N);
                if (rr < eps * eps)
                {
                    break;
                }

                float beta = rr / r;
                wrapper.Saxpy(P, R, beta, P, N);                // P = R + beta*P
                ++k;
            }

            X.RefreshHost();
        }
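ConjugateGradient calls wrapper.Fmsub to compute R = B - A*X, but that kernel is not among the listings. A minimal sketch, assuming the same Parallel.For style as Copy (Example #1) and the CSR row traversal of Multiply (Example #8 below):

        public static void Fmsub(FloatResidentArray res, FloatResidentArray b, SparseMatrix m, FloatResidentArray x, int N)
        {
            Parallel.For(0, N, (i) =>
            {
                // row i of the sparse product A*x
                float tmp = 0.0F;
                for (int j = m.rows[i]; j < m.rows[i + 1]; ++j)
                {
                    tmp += x[m.indices[j]] * m.data[j];
                }

                res[i] = b[i] - tmp;                            // res = b - A*x
            });
        }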
Example #5
        public SparseMatrix(Dictionary <int, float>[] from)
        {
            rows = new IntResidentArray(from.Length + 1);
            List <int>   _indices = new List <int>();
            List <float> _data    = new List <float>();

            int colCounter = 0;
            int i          = 0;

            for (; i < from.Length; ++i)
            {
                rows[i] = colCounter;
                foreach (var kvp in from[i])
                {
                    _data.Add(kvp.Value);
                    _indices.Add(kvp.Key);
                    colCounter += 1;
                }
            }
            rows[i] = colCounter;
            indices = new IntResidentArray(_indices.Count);
            for (i = 0; i < indices.Count; ++i)
            {
                indices[i] = _indices[i];
            }
            data = new FloatResidentArray(_data.Count);
            for (i = 0; i < data.Count; ++i)
            {
                data[i] = _data[i];
            }
        }
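The constructor above flattens the dictionaries into CSR storage (rows, indices, data). For reference, a hypothetical sketch of the Laplacian_1D factory called in Example #3, which is not listed here; it builds the standard 1-D [-1, 2, -1] stencil and hands it to this constructor:

        public static SparseMatrix Laplacian_1D(int N)
        {
            var stencil = new Dictionary<int, float>[N];
            for (int i = 0; i < N; ++i)
            {
                stencil[i] = new Dictionary<int, float>();
                if (i > 0)     stencil[i][i - 1] = -1.0f;       // sub-diagonal
                stencil[i][i] = 2.0f;                           // diagonal
                if (i < N - 1) stencil[i][i + 1] = -1.0f;       // super-diagonal
            }
            return new SparseMatrix(stencil);
        }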
Example #6
        public static void Total(FloatResidentArray a, int N, float[] total)
        {
            var cache = new SharedMemoryAllocator <float>().allocate(blockDim.x);

            int   tid        = threadIdx.x + blockDim.x * blockIdx.x;
            int   cacheIndex = threadIdx.x;
            float sum        = 0f;

            // grid-stride loop: each thread accumulates a private partial sum
            while (tid < N)
            {
                sum  = sum + a[tid];
                tid += blockDim.x * gridDim.x;
            }
            cache[cacheIndex] = sum;
            CUDAIntrinsics.__syncthreads();
            // tree reduction in shared memory, halving the number of active threads each step
            int i = blockDim.x / 2;

            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] = cache[cacheIndex] + cache[cacheIndex + i];
                }
                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }

            if (cacheIndex == 0)
            {
                // thread 0 of each block folds the block's result into total[0]
                AtomicExpr.apply(ref total[0], cache[0], (x, y) => x + y);
            }
        }
Example #7
 public SquareProblem(int N, int iter)
 {
     _N       = N;
     _h       = 1.0F / (float)_N;
     _invIter = 1.0F / (float)iter;
     _inner   = new FloatResidentArray((N - 1) * (N - 1));
     _iter    = iter;
 }
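The constructor assigns fields that are not part of the listing; a minimal sketch of the assumed declarations, inferred only from the assignments above:

 // Assumed private fields of SquareProblem (names taken from the constructor):
 private int                _N;       // grid resolution
 private float              _h;       // grid spacing, 1/N
 private float              _invIter; // 1/iter
 private int                _iter;    // iteration count
 private FloatResidentArray _inner;   // (N - 1) * (N - 1) interior values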
Example #8
        public static void Multiply(FloatResidentArray res, SparseMatrix m, FloatResidentArray v, int N)
        {
            Parallel.For(0, N, (i) =>
            {
                int rowless = m.rows[i];
                int rowup   = m.rows[i + 1];
                float tmp   = 0.0F;
                for (int j = rowless; j < rowup; ++j)
                {
                    tmp += v[m.indices[j]] * m.data[j];
                }

                res[i] = tmp;
            });
        }
Example #9
        public static void ScalarProd(int N, FloatResidentArray a, FloatResidentArray b, float[] result)
        {
            var cache      = new SharedMemoryAllocator <float>().allocate(blockDim.x);
            int tid        = threadIdx.x + blockDim.x * blockIdx.x;
            int cacheIndex = threadIdx.x;

            float tmp = 0.0F;

            while (tid < N)
            {
                tmp += a[tid] * b[tid];
                tid += blockDim.x * gridDim.x;
            }

            cache[cacheIndex] = tmp;

            CUDAIntrinsics.__syncthreads();

            int i = blockDim.x / 2;

            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] += cache[cacheIndex + i];
                }

                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }

            if (cacheIndex == 0)
            {
                AtomicAdd(ref result[0], cache[0]);
            }
        }
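A host-side call of this kernel is not shown; a minimal sketch, assuming the same wrapped-runner setup and error-checking pattern as the reduction Main method further below, and that a and b have already been pushed to the device with RefreshDevice:

            float[] dot = new float[1];                        // thread 0 of each block atomically adds its partial sum here
            cuda.ERROR_CHECK((cudaError_t)(int)wrapped.ScalarProd(N, a, b, dot));
            cuda.ERROR_CHECK(cuda.DeviceSynchronize());
            Console.WriteLine(dot[0]);                         // <a|b>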
Example #10
        private static void ScalarProd(float[] result, FloatResidentArray r1, FloatResidentArray r2, int N)
        {
            var cache      = new SharedMemoryAllocator <float>().allocate(blockDim.x);
            int tid        = threadIdx.x + blockDim.x * blockIdx.x;
            int cacheIndex = threadIdx.x;

            float tmp = 0.0F;

            while (tid < N)
            {
                tmp += r1[tid] * r2[tid];
                tid += blockDim.x * gridDim.x;
            }

            cache[cacheIndex] = tmp;

            CUDAIntrinsics.__syncthreads();

            int i = blockDim.x / 2;

            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] += cache[cacheIndex + i];
                }

                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }

            if (cacheIndex == 0)
            {
                AtomicExpr.apply(ref result[0], cache[0], (x, y) => x + y);
            }
        }
Example #11
        static void Main(string[] args)
        {
            const int          N   = 1024 * 1024 * 32;
            FloatResidentArray arr = new FloatResidentArray(N);

            float[] res = new float[1];
            for (int i = 0; i < N; ++i)
            {
                arr[i] = 1.0F;
            }

            arr.RefreshDevice();
            var            runner = HybRunner.Cuda();
            cudaDeviceProp prop;

            cuda.GetDeviceProperties(out prop, 0);
            runner.SetDistrib(16 * prop.multiProcessorCount, 1, 128, 1, 1, 128 * sizeof(float));
            var wrapped = runner.Wrap(new Program());

            runner.saveAssembly();
            cuda.ERROR_CHECK((cudaError_t)(int)wrapped.Total(arr, N, res));
            cuda.ERROR_CHECK(cuda.DeviceSynchronize());
            Console.WriteLine(res[0]);
        }
Example #12
 public static float ScalarProd(FloatResidentArray X, FloatResidentArray Y, int N)
 {
     return inner_scalar_prod((float*)X.DevicePointer, (float*)Y.DevicePointer, N);
 }
Example #13
 public static float ScalarProd(FloatResidentArray X, FloatResidentArray Y, int N)
 {
     return inner_scalar_prod((float*)X.DevicePointer, (float*)Y.DevicePointer, N);
     //return ParallelEnumerable.Range(0, N).Sum(i => X[i] * Y[i]);
 }
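inner_scalar_prod itself is not listed; the commented line hints at a purely managed alternative. A self-contained sketch of that fallback (ScalarProdCpu is a hypothetical name, it requires System.Linq, and it assumes X and Y are current on the host):

 public static float ScalarProdCpu(FloatResidentArray X, FloatResidentArray Y, int N)
 {
     // PLINQ dot product on the host, matching the commented-out line above
     return ParallelEnumerable.Range(0, N).Sum(i => X[i] * Y[i]);
 }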