Ejemplo n.º 1
0
        /// <summary>
        /// Per-pixel kernel: tests one pixel against the first <c>SPHERES</c> entries of
        /// <paramref name="s"/> via <c>hit</c> and writes the colour belonging to the largest
        /// value returned by <c>hit</c> as four bytes into <paramref name="ptr"/>.
        /// </summary>
        /// <param name="thread">Execution context supplying thread, block and grid indices.</param>
        /// <param name="s">Spheres to test; their <c>r</c>, <c>g</c>, <c>b</c> fields give the colour.</param>
        /// <param name="ptr">Output buffer; four consecutive bytes are written per pixel.</param>
        public static void thekernel(GThread thread, SphereOpenCL[] s, byte[] ptr)
        {
            // These shared buffers are allocated but never read or written below.
            SphereOpenCL[] sharedSphere = thread.AllocateShared<SphereOpenCL>("sharedSphere", 16);
            int[] sharedInt = thread.AllocateShared<int>("sharedInt", 16);

            // Global pixel coordinates handled by this thread.
            int x = thread.blockIdx.x * thread.blockDim.x + thread.threadIdx.x;
            int y = thread.blockIdx.y * thread.blockDim.y + thread.threadIdx.y;

            // Linear pixel index; one row spans blockDim.x * gridDim.x pixels.
            int rowWidth = thread.blockDim.x * thread.gridDim.x;
            int offset = y * rowWidth + x;

            // Pixel position relative to the centre (DIM / 2 is evaluated before the float conversion).
            float ox = x - ray_gui.DIM / 2;
            float oy = y - ray_gui.DIM / 2;

            float red = 0;
            float green = 0;
            float blue = 0;
            float best = -INF;

            int i = 0;
            while (i < SPHERES)
            {
                // n is handed to hit by reference and afterwards used as the colour scale factor.
                float n = 0;
                float t = hit(s[i], ox, oy, ref n);
                if (t > best)
                {
                    red = s[i].r * n;
                    green = s[i].g * n;
                    blue = s[i].b * n;
                    best = t;
                }
                i++;
            }

            // Store three colour channels followed by a constant fourth byte of 255.
            int pixelBase = offset * 4;
            ptr[pixelBase + 0] = (byte)(red * 255);
            ptr[pixelBase + 1] = (byte)(green * 255);
            ptr[pixelBase + 2] = (byte)(blue * 255);
            ptr[pixelBase + 3] = 255;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Evaluates the permutations assigned to this thread with <c>FindPathDistance</c>, keeps the
        /// smallest distance, then reduces the per-thread results inside the block so that each
        /// block writes one entry of <paramref name="answer"/>.
        /// </summary>
        /// <param name="thread">Execution context supplying thread, block and grid indices.</param>
        /// <param name="permutations">Exclusive upper bound of the permutation numbers to evaluate.</param>
        /// <param name="cities">Forwarded unchanged to <c>FindPathDistance</c>.</param>
        /// <param name="latitudes">Forwarded unchanged to <c>FindPathDistance</c>.</param>
        /// <param name="longitudes">Forwarded unchanged to <c>FindPathDistance</c>.</param>
        /// <param name="answer">Receives, at index <c>blockIdx.x</c>, the block's smallest distance and its permutation number.</param>
        public static void GpuFindPathDistance(GThread thread, int permutations, int cities,
                                               float[] latitudes, float[] longitudes, AnswerStruct[] answer)
        {
            var lane = thread.threadIdx.x;                      // index of this thread within its block
            var blockSize = thread.blockDim.x;
            var gridStride = blockSize * thread.gridDim.x;      // total number of threads in the grid

            // Shared scratch: a path buffer handed to FindPathDistance plus one result slot per thread.
            var paths = thread.AllocateShared<int>("path", _threadsPerBlock, _cities);
            var bestDistances = thread.AllocateShared<float>("dist", _threadsPerBlock);
            var bestPermutations = thread.AllocateShared<int>("perm", _threadsPerBlock);

            var bestDistance = float.MaxValue;
            var bestPermutation = 0;

            // Each thread starts at its global index and advances by the grid's thread count.
            for (var permutation = lane + thread.blockIdx.x * blockSize;
                 permutation < permutations;
                 permutation += gridStride)
            {
                var distance = FindPathDistance(permutations, permutation,
                                                cities, latitudes, longitudes, paths, lane);
                if (distance < bestDistance)
                {
                    bestDistance = distance;
                    bestPermutation = permutation;
                }
            }

            bestDistances[lane] = bestDistance;
            bestPermutations[lane] = bestPermutation;

            thread.SyncThreads();

            // Halving min-reduction over the shared slots.
            // credit: CUDA By Example, page 79:
            // http://www.amazon.com/CUDA-Example-Introduction-General-Purpose-Programming/dp/0131387685
            var half = blockSize / 2;
            while (half > 0)
            {
                if (lane < half && bestDistances[lane] > bestDistances[lane + half])
                {
                    bestDistances[lane] = bestDistances[lane + half];
                    bestPermutations[lane] = bestPermutations[lane + half];
                }
                thread.SyncThreads();
                half /= 2;
            }

            // Thread 0 publishes the block's result.
            if (lane == 0)
            {
                answer[thread.blockIdx.x].distance = bestDistances[0];
                answer[thread.blockIdx.x].pathNo = bestPermutations[0];
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// 64-bit permutation variant: each thread evaluates its share of permutation numbers with
        /// <c>FindPathDistance</c>, and the block reduces the per-thread minima so that one entry
        /// of <paramref name="answer"/> is written per block.
        /// </summary>
        /// <param name="thread">Execution context supplying thread, block and grid indices.</param>
        /// <param name="permutations">Exclusive upper bound of the permutation numbers to evaluate.</param>
        /// <param name="gpuLatLong">Forwarded unchanged to <c>FindPathDistance</c>.</param>
        /// <param name="answer">Receives, at index <c>blockIdx.x</c>, the block's smallest distance and its permutation number.</param>
        public static void GpuFindPathDistance(GThread thread,
                                               long permutations, LatLongStruct[] gpuLatLong, AnswerStruct[] answer)
        {
            var lane = thread.threadIdx.x;
            var gridStride = thread.blockDim.x * thread.gridDim.x;

            // Shared scratch: a path buffer handed to FindPathDistance plus one result slot per thread.
            var path = thread.AllocateShared<int>("path", _cities, _threadsPerBlock);
            var bestDistances = thread.AllocateShared<float>("dist", _threadsPerBlock);
            var bestPermutations = thread.AllocateShared<long>("perm", _threadsPerBlock);

            var bestDistance = float.MaxValue;
            var bestPermutation = 0L;

            // Start at the global thread index (computed in int, then widened) and step by the grid size.
            for (long permutation = lane + thread.blockIdx.x * thread.blockDim.x;
                 permutation < permutations;
                 permutation += gridStride)
            {
                var distance = FindPathDistance(thread, permutations, permutation, gpuLatLong, path);
                if (distance < bestDistance)
                {
                    bestDistance = distance;
                    bestPermutation = permutation;
                }
            }

            bestDistances[lane] = bestDistance;
            bestPermutations[lane] = bestPermutation;
            thread.SyncThreads();

            // Halving min-reduction over the shared slots.
            // credit: CUDA By Example, page 79:
            // http://www.amazon.com/CUDA-Example-Introduction-General-Purpose-Programming/dp/0131387685
            int half = thread.blockDim.x / 2;
            while (half > 0)
            {
                if (lane < half && bestDistances[lane] > bestDistances[lane + half])
                {
                    bestDistances[lane] = bestDistances[lane + half];
                    bestPermutations[lane] = bestPermutations[lane + half];
                }
                thread.SyncThreads();
                half /= 2;
            }

            // Thread 0 publishes the block's result.
            if (lane == 0)
            {
                answer[thread.blockIdx.x].distance = bestDistances[0];
                answer[thread.blockIdx.x].pathNo = bestPermutations[0];
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Accumulates <c>a[i] * b[i]</c> over the indices below <c>N</c> assigned to this thread,
        /// sums the per-thread totals within the block, and stores the block total in
        /// <paramref name="c"/> at index <c>blockIdx.x</c>.
        /// </summary>
        /// <param name="thread">Execution context supplying thread, block and grid indices.</param>
        /// <param name="a">First input vector.</param>
        /// <param name="b">Second input vector.</param>
        /// <param name="c">Receives one partial sum per block.</param>
        public static void Product(GThread thread, int[] a, int[] b, int[] c)
        {
            // NOTE(review): the shared buffer holds 4 ints but is indexed by threadIdx.x and by
            // threadIdx.x + blockDim.x / 2 below — presumably launched with at most 4 threads
            // per block; confirm against the launch site.
            int[] cache = thread.AllocateShared<int>("cache", 4);

            int lane = thread.threadIdx.x;
            int stride = thread.blockDim.x * thread.gridDim.x;

            // Per-thread partial sum over indices lane + block offset, stepping by the grid size.
            int partial = 0;
            for (int idx = lane + thread.blockIdx.x * thread.blockDim.x; idx < N; idx += stride)
            {
                partial = partial + a[idx] * b[idx];
            }
            cache[lane] = partial;

            thread.SyncThreads();

            // Halving sum-reduction over the shared buffer.
            for (int half = thread.blockDim.x / 2; half != 0; half /= 2)
            {
                if (lane < half)
                {
                    cache[lane] += cache[lane + half];
                }
                thread.SyncThreads();
            }

            // Thread 0 publishes the block total.
            if (lane == 0)
            {
                c[thread.blockIdx.x] = cache[0];
            }
        }
        /// <summary>
        /// Builds a 256-bin histogram of the first <paramref name="size"/> bytes of
        /// <paramref name="buffer"/>: counts are accumulated per block in shared memory and then
        /// added atomically into <paramref name="histo"/>.
        /// </summary>
        /// <param name="thread">Execution context supplying thread, block and grid indices.</param>
        /// <param name="buffer">Input bytes; each value selects the bin to increment.</param>
        /// <param name="size">Number of entries of <paramref name="buffer"/> to process.</param>
        /// <param name="histo">Global histogram; each thread adds into the bin at its <c>threadIdx.x</c>.</param>
        public static void histo_kernel(GThread thread, byte[] buffer, int size, uint[] histo)
        {
            int lane = thread.threadIdx.x;

            // Per-block accumulation buffer. The kernel is launched with 256 threads
            // (per the original author's note), so one write per thread clears it.
            uint[] temp = thread.AllocateShared<uint>("temp", 256);
            temp[lane] = 0;
            thread.SyncThreads();

            // Start at the global thread index and advance by the total thread count of the grid.
            int stride = thread.blockDim.x * thread.gridDim.x;
            for (int i = lane + thread.blockIdx.x * thread.blockDim.x; i < size; i += stride)
            {
                thread.atomicAdd(ref temp[buffer[i]], 1);
            }

            // Wait until every thread of the block has finished its shared-memory updates,
            // then merge this block's counts into the global histogram, one bin per thread.
            thread.SyncThreads();

            thread.atomicAdd(ref histo[lane], temp[lane]);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Histogram kernel: counts byte values of <paramref name="buffer"/> into a 256-entry
        /// shared buffer per block, then atomically adds that buffer into <paramref name="histo"/>.
        /// </summary>
        /// <param name="thread">Execution context supplying thread, block and grid indices.</param>
        /// <param name="buffer">Input bytes; each value selects the bin to increment.</param>
        /// <param name="size">Number of entries of <paramref name="buffer"/> to process.</param>
        /// <param name="histo">Global histogram; each thread adds into the bin at its <c>threadIdx.x</c>.</param>
        public static void histo_kernel(GThread thread, byte[] buffer, int size, uint[] histo)
        {
            // Block-local bins. With the 256-thread launch noted by the original author,
            // each thread zeroes exactly one bin.
            uint[] bins = thread.AllocateShared<uint>("temp", 256);
            int binOfThread = thread.threadIdx.x;
            bins[binOfThread] = 0;
            thread.SyncThreads();

            // First element for this thread and the distance to its next element.
            int step = thread.blockDim.x * thread.gridDim.x;
            int pos = thread.blockIdx.x * thread.blockDim.x + thread.threadIdx.x;
            for (; pos < size; pos += step)
            {
                thread.atomicAdd(ref bins[buffer[pos]], 1);
            }

            // All shared-memory increments of the block must be complete before they are
            // folded into the global histogram with one atomic add per thread.
            thread.SyncThreads();

            thread.atomicAdd(ref histo[binOfThread], bins[binOfThread]);
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Computes per-block partial dot products of <paramref name="a"/> and
        /// <paramref name="b"/> over indices below <c>N</c> and stores each block's partial sum
        /// in <paramref name="c"/> at index <c>blockIdx.x</c>.
        /// </summary>
        /// <param name="thread">Execution context supplying thread, block and grid indices.</param>
        /// <param name="a">First input vector.</param>
        /// <param name="b">Second input vector.</param>
        /// <param name="c">Receives one partial sum per block.</param>
        public static void Dot(GThread thread, float[] a, float[] b, float[] c)
        {
            float[] cache = thread.AllocateShared<float>("cache", threadsPerBlock);

            int lane = thread.threadIdx.x;
            int stride = thread.blockDim.x * thread.gridDim.x;

            // Per-thread partial sum, starting at the global thread index.
            float partial = 0;
            for (int idx = lane + thread.blockIdx.x * thread.blockDim.x; idx < N; idx += stride)
            {
                partial += a[idx] * b[idx];
            }

            // Publish the partial sum and wait for the rest of the block.
            cache[lane] = partial;
            thread.SyncThreads();

            // Halving reduction; as the original author notes, threadsPerBlock must be a
            // power of 2 for this loop to cover every slot.
            for (int half = thread.blockDim.x / 2; half != 0; half /= 2)
            {
                if (lane < half)
                {
                    cache[lane] += cache[lane + half];
                }
                thread.SyncThreads();
            }

            // Thread 0 publishes the block total.
            if (lane == 0)
            {
                c[thread.blockIdx.x] = cache[0];
            }
        }
Ejemplo n.º 8
0
        /// <summary>
        /// For the element at this thread's global index, stores
        /// <c>(a[index] + b[index]) * ConstantMemory[index]</c> into <paramref name="c"/>.
        /// </summary>
        /// <param name="thread">Execution context supplying thread, block and grid indices.</param>
        /// <param name="a">First addend vector (annotated with the global address space).</param>
        /// <param name="b">Second addend vector.</param>
        /// <param name="c">Output vector.</param>
        public static void VectorAdd(GThread thread,
                                     [CudafyAddressSpace(eCudafyAddressSpace.Global)] int[] a,
                                     int[] b,
                                     int[] c)
        {
            // Allocated but not otherwise used by this kernel.
            int[] shared = thread.AllocateShared<int>("shared", Program.N);

            // Global index of this thread (alternative kept from the original: thread.get_local_id(0)).
            int index = thread.blockIdx.x * thread.blockDim.x + thread.threadIdx.x;

            int sum = a[index] + b[index];
            c[index] = sum * ConstantMemory[index];

            thread.SyncThreads();
        }
Ejemplo n.º 9
0
        /// <summary>
        /// For the row selected by <c>blockIdx.x</c>, multiplies each <c>Constant2D</c> entry by
        /// <c>Constant2D.Rank</c>, stores it in a shared 2D cache and copies it to
        /// <paramref name="result"/> at <c>x * YSIZE + y</c>.
        /// </summary>
        /// <param name="thread">Execution context; only <c>blockIdx.x</c> is read.</param>
        /// <param name="result">Flattened output, written row by row.</param>
        public static void copy2D(GThread thread, int[] result)
        {
            int[,] cache = thread.AllocateShared<int>("cache", XSIZE, YSIZE);
            int row = thread.blockIdx.x;

            for (int col = 0; col < YSIZE; col++)
            {
                // Stage the scaled value in the shared cache, then read it back for the output.
                cache[row, col] = Constant2D[row, col] * Constant2D.Rank;
                result[row * YSIZE + col] = cache[row, col];
            }
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Dot-product kernel: each thread sums <c>a[i] * b[i]</c> over its indices below
        /// <c>N</c>, the block adds those sums together in shared memory, and thread 0 writes
        /// the block total to <paramref name="c"/> at index <c>blockIdx.x</c>.
        /// </summary>
        /// <param name="thread">Execution context supplying thread, block and grid indices.</param>
        /// <param name="a">First input vector.</param>
        /// <param name="b">Second input vector.</param>
        /// <param name="c">Receives one partial sum per block.</param>
        public static void dot(GThread thread, float[] a, float[] b, float[] c)
        {
            float[] cache = thread.AllocateShared<float>("cache", threadsPerBlock);

            int slot = thread.threadIdx.x;
            int step = thread.blockDim.x * thread.gridDim.x;
            int pos = thread.blockIdx.x * thread.blockDim.x + thread.threadIdx.x;

            // Accumulate this thread's share of the products.
            float acc = 0;
            for (; pos < N; pos += step)
            {
                acc += a[pos] * b[pos];
            }

            // Store the per-thread sum and wait until the whole block has done the same.
            cache[slot] = acc;
            thread.SyncThreads();

            // Halving reduction; as the original author notes, threadsPerBlock must be a
            // power of 2 for this to add every slot exactly once.
            for (int width = thread.blockDim.x / 2; width != 0; width /= 2)
            {
                if (slot < width)
                {
                    cache[slot] += cache[slot + width];
                }
                thread.SyncThreads();
            }

            // Thread 0 publishes the block total.
            if (slot == 0)
            {
                c[thread.blockIdx.x] = cache[0];
            }

            //callWithShared(cache);
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Stores <c>gridDim.x</c> as <c>distance</c> and <c>blockDim.x</c> as <c>pathNo</c> in a
        /// per-thread shared slot, then has thread 0 copy slot 0 into <paramref name="answer"/>
        /// at index <c>blockIdx.x</c>.
        /// </summary>
        /// <param name="thread">Execution context supplying thread, block and grid indices.</param>
        /// <param name="answer">Receives one entry per block.</param>
        public static void GpuFindPathDistance(GThread thread, AnswerStruct[] answer)
        {
            var answerLocal = thread.AllocateShared<AnswerStruct>("ansL", ThreadsPerBlock);
            var lane = thread.threadIdx.x;

            // The values written are the launch dimensions themselves.
            var bestDistance = thread.gridDim.x;
            var bestPermutation = thread.blockDim.x;

            // This sum is computed but its result is not used afterwards.
            var sum = 0;
            var i = 0;
            while (i < thread.blockDim.x)
            {
                sum += i * lane;
                i++;
            }

            answerLocal[lane].distance = bestDistance;
            answerLocal[lane].pathNo = bestPermutation;
            thread.SyncThreads();

            // Thread 0 publishes the block's entry.
            if (lane == 0)
            {
                answer[thread.blockIdx.x] = answerLocal[0];
            }
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Computes the sum of squared differences between edge <paramref name="edgeIndexA"/> of
        /// tile <c>blockIdx.x</c> and edge <paramref name="edgeIndexB"/> of tile <c>blockIdx.y</c>
        /// (both read from <c>Edges</c>) and stores it in <paramref name="fit"/>.
        /// </summary>
        /// <param name="gThread">Execution context; <c>threadIdx.x</c> selects the pixel along the edge.</param>
        /// <param name="edgeIndexA">Edge index used for the first tile.</param>
        /// <param name="edgeIndexB">Edge index used for the second tile.</param>
        /// <param name="fit">Output matrix indexed by <c>[blockIdx.x, blockIdx.y]</c>.</param>
        public static void ComputeFitsKernel(GThread gThread, int edgeIndexA, int edgeIndexB, float[,] fit)
        {
            // One slot per pixel; the reduction below is hard-wired to 64 slots.
            var sum = gThread.AllocateShared<float>("sum", 64);

            var tileA = gThread.blockIdx.x;
            var tileB = gThread.blockIdx.y;
            var pixel = gThread.threadIdx.x;

            // Squared difference for this thread's pixel.
            var diff = Edges[tileA, edgeIndexA, pixel] - Edges[tileB, edgeIndexB, pixel];
            sum[pixel] = diff * diff;

            gThread.SyncThreads();

            // Halving sum-reduction over the 64 slots.
            var half = 64 / 2;
            while (half > 0)
            {
                if (pixel < half)
                {
                    sum[pixel] += sum[pixel + half];
                }

                gThread.SyncThreads();
                half /= 2;
            }

            // Thread 0 publishes the total for this tile pair.
            if (pixel == 0)
            {
                fit[tileA, tileB] = sum[0];
            }
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Writes the launch dimensions (<c>gridDim.x</c> as <c>distance</c>, <c>blockDim.x</c> as
        /// <c>pathNo</c>) into a shared per-thread slot and lets thread 0 copy slot 0 into
        /// <paramref name="answer"/> at index <c>blockIdx.x</c>.
        /// </summary>
        /// <param name="thread">Execution context supplying thread, block and grid indices.</param>
        /// <param name="answer">Receives one entry per block.</param>
        public static void GpuFindPathDistance(GThread thread, AnswerStruct[] answer)
        {
            var answerLocal = thread.AllocateShared<AnswerStruct>("ansL", ThreadsPerBlock);
            var slot = thread.threadIdx.x;

            var bestDistance = thread.gridDim.x;
            var bestPermutation = thread.blockDim.x;

            // Accumulated here but not read again later in the method.
            var sum = 0;
            for (int i = 0; i < thread.blockDim.x; i++)
            {
                sum += i * slot;
            }

            answerLocal[slot].distance = bestDistance;
            answerLocal[slot].pathNo = bestPermutation;
            thread.SyncThreads();

            // Only the first thread of the block writes the block's entry.
            if (slot == 0)
            {
                answer[thread.blockIdx.x] = answerLocal[0];
            }
        }
        /// <summary>
        /// Writes, for every thread, the sum of its global X and Y coordinates into
        /// <paramref name="results"/> at the thread's unique linear index.
        /// </summary>
        /// <remarks>
        /// The linear index is block-major: the block's linear index within the grid is scaled by
        /// the number of threads per block before the thread's linear index within the block is
        /// added. The earlier formula added the two indices without that scaling, so threads of
        /// different blocks were mapped onto the same element of <paramref name="results"/>.
        /// </remarks>
        /// <param name="thread">Execution context supplying thread, block and grid indices.</param>
        /// <param name="results">Output array; threads whose index is not below its length write nothing.</param>
        public static void GenerateRipples(GThread thread, int[] results)
        {
            // Position of this thread inside its block and of the block inside the grid.
            var threadPosInBlockX = thread.threadIdx.x;
            var threadPosInBlockY = thread.threadIdx.y;
            var blockPosInGridX = thread.blockIdx.x;
            var blockPosInGridY = thread.blockIdx.y;

            // Launch dimensions.
            var gridSizeX = thread.gridDim.x;
            var blockSizeX = thread.blockDim.x;
            var blockSizeY = thread.blockDim.y;

            // Unique linear id: (linear block index) * (threads per block) + (linear index in block).
            var threadsPerBlock = blockSizeX * blockSizeY;
            var blockId = blockPosInGridY * gridSizeX + blockPosInGridX;
            var threadInBlock = threadPosInBlockY * blockSizeX + threadPosInBlockX;
            var threadId = blockId * threadsPerBlock + threadInBlock;

            var shared = thread.AllocateShared<int>("sharedarray", 1024);

            // Kept from the original as a convenient breakpoint location.
            shared[0] = 0;

            // Global coordinates of this thread across the whole grid.
            var threadX = blockSizeX * blockPosInGridX + threadPosInBlockX;
            var threadY = blockSizeY * blockPosInGridY + threadPosInBlockY;

            // Guard against launches with more threads than array elements; writing past the
            // end would be an access violation.
            if (threadId < results.Length)
            {
                results[threadId] = threadX + threadY;
            }
        }
Ejemplo n.º 15
0
        /// <summary>
        /// Computes a per-pixel error between a strip of patterns and the corresponding source
        /// regions, sums the errors per block in shared memory, and stores the block sums in
        /// <paramref name="blockAvgs"/>. A block whose columns extend past the pattern of its
        /// first column writes the remainder as the first block of the next pattern.
        /// </summary>
        /// <remarks>
        /// <paramref name="patternLocations"/> is read only by threads that pass the
        /// <c>x &lt; patternsWidth &amp;&amp; y &lt; patternsHeight</c> check. Previously every thread read it,
        /// so threads beyond the right edge used a pattern id past the last pattern
        /// (assuming one row per pattern — confirm against the caller).
        /// </remarks>
        /// <param name="thread">Execution context supplying thread, block and grid indices.</param>
        /// <param name="source">Source pixel bytes, three bytes per pixel in the index arithmetic used here.</param>
        /// <param name="sourceStride">Byte offset between consecutive source rows.</param>
        /// <param name="sourceWidth">Not read by this kernel.</param>
        /// <param name="sourceHeight">Not read by this kernel.</param>
        /// <param name="patterns">Pattern strip bytes, three bytes per pixel in the index arithmetic used here.</param>
        /// <param name="patternsStride">Byte offset between consecutive rows of the pattern strip.</param>
        /// <param name="patternsWidth">Width limit for the x coordinate.</param>
        /// <param name="patternsHeight">Height limit for the y coordinate.</param>
        /// <param name="patternsOffsetX">Pixel offset added to x when indexing <paramref name="patterns"/>.</param>
        /// <param name="patternStride">Not read by this kernel.</param>
        /// <param name="patternWidth">Width of one pattern; <c>x / patternWidth</c> selects the pattern.</param>
        /// <param name="patternHeight">Not read by this kernel.</param>
        /// <param name="patternLocations">Per pattern: column 0 is the source start X, column 1 the source start Y.</param>
        /// <param name="blockAvgs">Output indexed by pattern, block column within the pattern, and block row.</param>
        public static void RepeatedAverageColorDifferenceKernel(GThread thread,
                                                                byte[] source, int sourceStride, int sourceWidth, int sourceHeight,
                                                                byte[] patterns, int patternsStride, int patternsWidth, int patternsHeight, int patternsOffsetX,
                                                                int patternStride, int patternWidth, int patternHeight,
                                                                int[,] patternLocations, int[,,] blockAvgs)
        {
            var threadX = thread.threadIdx.x;
            var threadY = thread.threadIdx.y;

            int x = thread.blockIdx.x * thread.blockDim.x + threadX;
            int y = thread.blockIdx.y * thread.blockDim.y + threadY;

            // Per-pixel errors of this block live in shared memory; threads outside the
            // pattern strip leave their slot at zero.
            var pixelAvgDistance = thread.AllocateShared<float>("pixelSums", (int)Filters.BlockSideLength, (int)Filters.BlockSideLength);

            pixelAvgDistance[threadX, threadY] = 0;

            var patternId = x / patternWidth;

            if (x < patternsWidth && y < patternsHeight)
            {
                // Look up the source origin only for in-range threads, so patternId is
                // never used to index patternLocations for padding threads.
                var sourceStartX = patternLocations[patternId, 0];
                var sourceStartY = patternLocations[patternId, 1];

                var patternIndex = 3 * (x + patternsOffsetX) + y * patternsStride;
                var sourceIndex = 3 * ((x % patternWidth) + sourceStartX) + (y + sourceStartY) * sourceStride;

                pixelAvgDistance[threadX, threadY] = ComputePixelError(patterns, patternIndex, source, sourceIndex);
            }

            // Wait until every thread of the block has stored its error.
            thread.SyncThreads();

            // The first thread of each block adds up the block's errors; the CPU combines the blocks.
            if (threadX == 0 && threadY == 0)
            {
                float thisBlockAvg = 0;
                float nextBlockAvg = 0;

                // Columns that still belong to this block's first pattern go to thisBlockAvg,
                // columns that already belong to the following pattern go to nextBlockAvg.
                for (var ty = 0; ty < (int)Filters.BlockSideLength; ty++)
                {
                    for (var tx = 0; tx < (int)Filters.BlockSideLength; tx++)
                    {
                        if ((x + tx) / patternWidth == patternId)
                        {
                            thisBlockAvg += pixelAvgDistance[tx, ty];
                        }
                        else
                        {
                            nextBlockAvg += pixelAvgDistance[tx, ty];
                        }
                    }
                }

                // NOTE(review): patternWidth / blockDim.x is an integer division, so Ceiling has
                // no effect and the result is 0 when patternWidth < blockDim.x, which would make
                // the modulo below divide by zero. Left unchanged because the host-side layout of
                // blockAvgs is not visible here — confirm the intended rounding.
                var blocksPerPattern = (int)GMath.Ceiling(patternWidth / thread.blockDim.x);

                // Store this block's sum for its own pattern.
                if (thisBlockAvg > 0)
                {
                    blockAvgs[patternId, thread.blockIdx.x % blocksPerPattern, thread.blockIdx.y] = (int)GMath.Round(thisBlockAvg);
                }

                if (nextBlockAvg > 0)
                {
                    var isLastLastPattern = (patternsWidth / patternWidth) - 1 == patternId;

                    // The tail of this block is the first block of the next pattern, if there is one.
                    if (!isLastLastPattern)
                    {
                        blockAvgs[patternId + 1, 0, thread.blockIdx.y] = (int)GMath.Round(nextBlockAvg);
                    }
                }
            }
        }
Ejemplo n.º 16
0
        /// <summary>
        /// Computes, per pixel of the pattern, the summed channel distance to the source image at
        /// the given offset, and stores the total of each block in <paramref name="blockSums"/>.
        /// </summary>
        /// <param name="thread">Execution context supplying thread, block and grid indices.</param>
        /// <param name="source">Source pixel bytes, three bytes per pixel in the index arithmetic used here.</param>
        /// <param name="sourceStride">Byte offset between consecutive source rows.</param>
        /// <param name="sourceWidth">Not read by this kernel.</param>
        /// <param name="sourceHeight">Not read by this kernel.</param>
        /// <param name="pattern">Pattern pixel bytes, three bytes per pixel in the index arithmetic used here.</param>
        /// <param name="patternStride">Byte offset between consecutive pattern rows.</param>
        /// <param name="patternWidth">Width limit for the x coordinate.</param>
        /// <param name="patternHeight">Height limit for the y coordinate.</param>
        /// <param name="sourceStartX">Pixel X offset of the pattern within the source.</param>
        /// <param name="sourceStartY">Pixel Y offset of the pattern within the source.</param>
        /// <param name="blockSums">Output indexed by <c>[blockIdx.x, blockIdx.y]</c>.</param>
        public static void AverageColorDifferenceKernel(GThread thread,
                                                        byte[] source, int sourceStride, int sourceWidth, int sourceHeight,
                                                        byte[] pattern, int patternStride, int patternWidth, int patternHeight,
                                                        int sourceStartX, int sourceStartY, int[,] blockSums)
        {
            int localX = thread.threadIdx.x;
            int localY = thread.threadIdx.y;

            // Per-pixel sums of this block; slots of skipped pixels stay at zero.
            var pixelSums = thread.AllocateShared<int>("pixelSums", (int)Filters.BlockSideLength, (int)Filters.BlockSideLength);
            pixelSums[localX, localY] = 0;

            int x = thread.blockIdx.x * thread.blockDim.x + localX;
            int y = thread.blockIdx.y * thread.blockDim.y + localY;

            if (x < patternWidth && y < patternHeight)
            {
                var patternIndex = 3 * x + y * patternStride;

                // Pixels whose first channel is 3 or less ("close to black") are ignored.
                if (pattern[patternIndex] > 3)
                {
                    var sourceIndex = 3 * (x + sourceStartX) + (y + sourceStartY) * sourceStride;

                    pixelSums[localX, localY] =
                        Distance(pattern[patternIndex], source[sourceIndex]) +
                        Distance(pattern[patternIndex + 1], source[sourceIndex + 1]) +
                        Distance(pattern[patternIndex + 2], source[sourceIndex + 2]);
                }
            }

            // Every thread of the block must have stored its value before the summation.
            thread.SyncThreads();

            // The first thread of the block totals the slots; the CPU adds up the block sums.
            if (localX == 0 && localY == 0)
            {
                int side = (int)Filters.BlockSideLength;
                int blockSum = 0;

                for (var ty = 0; ty < side; ty++)
                {
                    for (var tx = 0; tx < side; tx++)
                    {
                        blockSum += pixelSums[tx, ty];
                    }
                }

                blockSums[thread.blockIdx.x, thread.blockIdx.y] = blockSum;
            }

            // Disabled alternative kept from the original: halving reduction along x,
            // followed by a column sum in thread (0, 0).

            //var window = (int)Filters.BlockSideLength / 2;

            //while (window > 0)
            //{
            //    if (thread.threadIdx.x < window)
            //        pixelSums[thread.threadIdx.x, thread.threadIdx.y] += pixelSums[thread.threadIdx.x+window, thread.threadIdx.y];

            //    window /= 2;

            //    thread.SyncThreads();
            //}

            //if (thread.threadIdx.x == 0 && thread.threadIdx.y == 0)
            //{
            //    int blockSum = 0;

            //    for (var ty = 0; ty < (int)Filters.BlockSideLength; ty++)
            //        blockSum += pixelSums[0, ty];

            //    //Store the block's sum
            //    blockSums[thread.blockIdx.x, thread.blockIdx.y] = blockSum;
            //}
        }
Ejemplo n.º 17
0
 /// <summary>
 /// Copies the row of <c>Constant2D</c> selected by <c>blockIdx.x</c>, scaled by
 /// <c>Constant2D.Rank</c>, through a shared 2D cache into <paramref name="result"/>
 /// at <c>x * YSIZE + y</c>.
 /// </summary>
 /// <param name="thread">Execution context; only <c>blockIdx.x</c> is read.</param>
 /// <param name="result">Flattened output, written row by row.</param>
 public static void copy2D(GThread thread, int[] result)
 {
     int[,] cache = thread.AllocateShared<int>("cache", XSIZE, YSIZE);
     int x = thread.blockIdx.x;

     // Walk the YSIZE columns of this row.
     for (int y = 0; y < YSIZE; y++)
     {
         cache[x, y] = Constant2D[x, y] * Constant2D.Rank;

         // The output takes the value back from the shared cache.
         int flat = x * YSIZE + y;
         result[flat] = cache[x, y];
     }
 }
Ejemplo n.º 18
0
        /// <summary>
        /// Scores one permutation of nine tiles per thread using the <c>LeftRightFit</c> and
        /// <c>TopBottomFit</c> tables, then reduces within the block to the evaluation with the
        /// smallest metric and stores it in <paramref name="evaluations"/> at index <c>blockIdx.x</c>.
        /// </summary>
        /// <param name="gThread">Execution context supplying thread and block indices.</param>
        /// <param name="evaluations">Receives one evaluation per block.</param>
        public static void ExplorePermutationsKernel(GThread gThread, Evaluation[] evaluations)
        {
            // Shared scratch sized for 256 threads: one evaluation and one 9-entry tile order each.
            var blockEvaluations = gThread.AllocateShared<Evaluation>("be", 256);
            var v = gThread.AllocateShared<byte>("v", 256, 9);
            var t = gThread.threadIdx.x;

            var permutation = gThread.blockIdx.x * gThread.blockDim.x + t;

            // Row t of v is passed to the helper and read back below as the tile at each
            // position of the 3x3 layout:
            // 0 1 2
            // 3 4 5
            // 6 7 8
            TileOrderFromPermutation(Permutations, permutation, 9, v, t);

            var metric = 0f;

            // Horizontal neighbours, one row of the layout per line.
            metric += LeftRightFit[v[t, 0], v[t, 1]] + LeftRightFit[v[t, 1], v[t, 2]];
            metric += LeftRightFit[v[t, 3], v[t, 4]] + LeftRightFit[v[t, 4], v[t, 5]];
            metric += LeftRightFit[v[t, 6], v[t, 7]] + LeftRightFit[v[t, 7], v[t, 8]];

            // Vertical neighbours, one column of the layout per line.
            metric += TopBottomFit[v[t, 0], v[t, 3]] + TopBottomFit[v[t, 3], v[t, 6]];
            metric += TopBottomFit[v[t, 1], v[t, 4]] + TopBottomFit[v[t, 4], v[t, 7]];
            metric += TopBottomFit[v[t, 2], v[t, 5]] + TopBottomFit[v[t, 5], v[t, 8]];

            blockEvaluations[t].Permutation = permutation;
            blockEvaluations[t].Metric = metric;

            gThread.SyncThreads();

            // Halving min-reduction over the 256 slots: keep the evaluation with the smaller metric.
            var half = 256 / 2;
            while (half > 0)
            {
                if (t < half && blockEvaluations[t].Metric > blockEvaluations[t + half].Metric)
                {
                    blockEvaluations[t] = blockEvaluations[t + half];
                }

                gThread.SyncThreads();
                half /= 2;
            }

            // Thread 0 publishes the block's best evaluation.
            if (t == 0)
            {
                evaluations[gThread.blockIdx.x] = blockEvaluations[0];
            }
        }
Ejemplo n.º 19
0
        /// <summary>
        /// Computes partial sums of the kernel gradient of a 2D convolution. Each block in x
        /// handles one kernel element; each thread contributes one product of an input value and
        /// a gradient value, and the block's sum is stored in <paramref name="resultPartials"/>.
        /// </summary>
        /// <param name="thread">Execution context supplying thread, block and grid indices.</param>
        /// <param name="input">Flattened input values, indexed through <c>shapes[0]</c>.</param>
        /// <param name="gradient">Flattened gradient values, indexed through <c>shapes[2]</c>.</param>
        /// <param name="resultPartials">Output indexed by <c>[blockIdx.x, blockIdx.y]</c>.</param>
        /// <param name="shapes">Shape descriptors; entries 0, 1, 2 and 4 are used here.</param>
        /// <param name="paddingX">Horizontal padding subtracted from the window origin.</param>
        /// <param name="paddingY">Vertical padding subtracted from the window origin.</param>
        /// <param name="stride">Step between consecutive output positions.</param>
        private static void GpuConv2DKernelsGradient(GThread thread, float[] input, float[] gradient, float[,] resultPartials, GpuShape[] shapes, int paddingX, int paddingY, int stride)
        {
            /*
             * Reference loop nest this kernel parallelises:
             *
             * for (int kernelD = 0; kernelD < kernels.Depth; ++kernelD)
             * for (int kernelH = 0; kernelH < kernels.Height; ++kernelH)
             * for (int kernelW = 0; kernelW < kernels.Width; ++kernelW)
             * for (int kernelN = 0; kernelN < kernels.BatchSize; ++kernelN)
             * {
             *  for (int n = 0; n < gradient.BatchSize; ++n)
             *  for (int h = -paddingY, outH = 0; outH < gradient.Height; h += stride, ++outH)
             *  for (int w = -paddingX, outW = 0; outW < gradient.Width; w += stride, ++outW)
             *  {
             *      float grad = gradient[outW, outH, kernelN, n];
             *      float kernGradVal = input.TryGet(0, w + kernelW, h + kernelH, kernelD, n) * grad;
             *      kernelsGradient[kernelW, kernelH, kernelD, kernelN] += kernGradVal;
             *  }
             * }
             */

            // Shared slots for the per-thread products that are reduced at the end.
            float[] sdata = thread.AllocateShared<float>("sdata", THREADS_PER_BLOCK);

            int resultElemId = thread.blockIdx.x;
            int tid = thread.threadIdx.x;

            // Index of this thread's product among all products for one result element.
            int id = thread.blockDim.x * thread.blockIdx.y + tid;

            int threadsRequiredPerResultElem = shapes[4].BatchSize * shapes[4].Height * shapes[4].Width;

            // shapes[1] decodes the result element id into kernel coordinates.
            int kernelN = shapes[1].GetBatch(resultElemId);
            int kernelD = shapes[1].GetDepth(resultElemId);
            int kernelH = shapes[1].GetHeight(resultElemId);
            int kernelW = shapes[1].GetWidth(resultElemId);

            // shapes[4] decodes the product id into batch and output coordinates.
            int n = shapes[4].GetBatch(id);
            int outH = shapes[4].GetHeight(id);
            int outW = shapes[4].GetWidth(id);

            // Origin of the window in input coordinates.
            int h = stride * outH - paddingY;
            int w = stride * outW - paddingX;

            // Threads beyond the required count, or whose input lookup yields a negative
            // index, contribute zero.
            float product = 0;

            if (id < threadsRequiredPerResultElem)
            {
                int inputIndex = shapes[0].TryGetIndex(w + kernelW, h + kernelH, kernelD, n);
                if (inputIndex >= 0)
                {
                    product = input[inputIndex] * gradient[shapes[2].GetIndex(outW, outH, kernelN, n)];
                }

                //if (resultElemId == 0)
                //    Console.WriteLine("tid=%d - %f", id, temp);
            }
            sdata[tid] = product;

            thread.SyncThreads();

            // Halving sum-reduction over the block's slots.
            for (int half = thread.blockDim.x / 2; half != 0; half /= 2)
            {
                if (tid < half)
                {
                    sdata[tid] += sdata[tid + half];
                }
                thread.SyncThreads();
            }

            // Thread 0 publishes the block's partial sum.
            if (tid == 0)
            {
                //if (resultElemId == 0)
                //    Console.WriteLine("gridDim.x=%d gridDim.y=%d blockDim.x=%d blockDim.y=%d", thread.gridDim.x, thread.gridDim.y, thread.blockDim.x, thread.blockDim.y);
                resultPartials[thread.blockIdx.x, thread.blockIdx.y] = sdata[0];
            }
        }
Ejemplo n.º 20
0
        /// <summary>
        /// Computes partial sums of the input gradient of a 2D convolution. Each block in x
        /// handles one input-gradient element; each thread contributes one product of a gradient
        /// value and a rotated-kernel value, and the block's sum is stored in
        /// <paramref name="resultPartials"/>.
        /// </summary>
        /// <param name="thread">Execution context supplying thread, block and grid indices.</param>
        /// <param name="gradient">Flattened gradient values, indexed through <c>shapes[0]</c>.</param>
        /// <param name="rotKernels">Flattened rotated kernels, indexed through <c>shapes[1]</c>.</param>
        /// <param name="resultPartials">Output indexed by <c>[blockIdx.x, blockIdx.y]</c>.</param>
        /// <param name="shapes">Shape descriptors; entries 0 to 3 are used here.</param>
        /// <param name="paddingX">Horizontal padding subtracted from the window origin.</param>
        /// <param name="paddingY">Vertical padding subtracted from the window origin.</param>
        /// <param name="stride">Step between consecutive output positions.</param>
        private static void GpuConv2DInputGradient(GThread thread, float[] gradient, float[] rotKernels, float[,] resultPartials, GpuShape[] shapes, int paddingX, int paddingY, int stride)
        {
            /*
             * Reference loop nest this kernel parallelises:
             *
             * for (int n = 0; n < gradients.BatchSize; ++n)
             * for (int outW = 0, w = -paddingX; outW < inputGradients.Width; w += stride, ++outW)
             * for (int outH = 0, h = -paddingY; outH < inputGradients.Height; h += stride, ++outH)
             * for (int outD = 0; outD < inputGradients.Depth; ++outD)
             * {
             *  for (int kernelN = 0; kernelN < rotKernels.BatchSize; ++kernelN)
             *  for (int kernelH = 0; kernelH < rotKernels.Height; ++kernelH)
             *  for (int kernelW = 0; kernelW < rotKernels.Width; ++kernelW)
             *      inputGradients[outW, outH, outD, n] += gradients.TryGet(0, w + kernelW, h + kernelH, kernelN, n) * rotKernels[kernelW, kernelH, outD, kernelN];
             * }
             */

            // Shared slots for the per-thread products that are reduced at the end.
            float[] sdata = thread.AllocateShared<float>("sdata", THREADS_PER_BLOCK);

            int resultElemId = thread.blockIdx.x;
            int tid = thread.threadIdx.x;

            // Index of this thread's product among all products for one result element.
            int id = thread.blockDim.x * thread.blockIdx.y + tid;

            int threadsRequiredPerResultElem = shapes[1].BatchSize * shapes[1].Height * shapes[1].Width;

            // shapes[2] decodes the result element id into output coordinates.
            int outN = shapes[2].GetBatch(resultElemId);
            int outD = shapes[2].GetDepth(resultElemId);
            int outH = shapes[2].GetHeight(resultElemId);
            int outW = shapes[2].GetWidth(resultElemId);

            // shapes[3] decodes the product id into kernel coordinates.
            int kernelN = shapes[3].GetBatch(id);
            int kernelH = shapes[3].GetHeight(id);
            int kernelW = shapes[3].GetWidth(id);

            // Origin of the window in gradient coordinates.
            int h = stride * outH - paddingY;
            int w = stride * outW - paddingX;

            // Threads beyond the required count, or whose gradient lookup yields a negative
            // index, contribute zero.
            float product = 0;

            if (id < threadsRequiredPerResultElem)
            {
                int gradientIndex = shapes[0].TryGetIndex(w + kernelW, h + kernelH, kernelN, outN);
                if (gradientIndex >= 0)
                {
                    product = gradient[gradientIndex] * rotKernels[shapes[1].GetIndex(kernelW, kernelH, outD, kernelN)];
                }
            }
            sdata[tid] = product;

            thread.SyncThreads();

            // Halving sum-reduction over the block's slots.
            for (int half = thread.blockDim.x / 2; half != 0; half /= 2)
            {
                if (tid < half)
                {
                    sdata[tid] += sdata[tid + half];
                }
                thread.SyncThreads();
            }

            // Thread 0 publishes the block's partial sum.
            if (tid == 0)
            {
                resultPartials[thread.blockIdx.x, thread.blockIdx.y] = sdata[0];
            }
        }
Ejemplo n.º 21
0
        /// <summary>
        /// GPU kernel that compares two 4-bytes-per-pixel images and collects every pixel of
        /// the second image whose blue, green, red or alpha byte differs from the first image
        /// by more than 8.
        /// </summary>
        /// <remarks>
        /// The scan is serial: results are appended through running counters, so the order and
        /// position of every output depends on all preceding pixels. For that reason exactly one
        /// thread (thread (0,0) of block (0,0)) performs the scan and all other threads return
        /// immediately. Letting every launched thread run the full scan, as this kernel used to,
        /// made them race on the counters and on the output buffers.
        /// </remarks>
        /// <param name="thread">Cudafy thread context; only used to pick the scanning thread.</param>
        /// <param name="dev_bitmap1">First image; per pixel the bytes are read as blue, green, red, alpha.</param>
        /// <param name="dev_bitmap2">Second image, same layout as <paramref name="dev_bitmap1"/>.</param>
        /// <param name="dev_result">Output: blue, green, red, alpha bytes of each differing pixel of the second image, appended in scan order.</param>
        /// <param name="imageWidth">[0] is the number of pixels per row (inner loop bound), [1] the number of rows (outer loop bound).</param>
        /// <param name="count">Output: [0] receives the number of bytes written to <paramref name="dev_result"/>, [1] the number of ints written to <paramref name="possition"/>. Always written, so identical images yield 0/0.</param>
        /// <param name="possition">Output: (row, column) pairs of the differing pixels, appended in scan order.</param>
        public static void calGPU(GThread thread, byte[] dev_bitmap1, byte[] dev_bitmap2, byte[] dev_result, int[] imageWidth, int[] count, int[] possition)
        {
            // Only a single thread of the whole launch may run the append-style scan below.
            if (thread.blockIdx.x != 0 || thread.blockIdx.y != 0 || thread.threadIdx.x != 0 || thread.threadIdx.y != 0)
            {
                return;
            }

            // A channel must differ by more than this many levels to count as changed.
            const int channelTolerance = 8;

            int columns = imageWidth[0];
            int rows    = imageWidth[1];

            // Running write cursors; private to the scanning thread, so plain locals suffice.
            int resultLength   = 0;
            int positionLength = 0;

            for (int row = 0; row < rows; row++)
            {
                for (int column = 0; column < columns; column++)
                {
                    // Byte offset of the pixel: 4 bytes per pixel, rows stored back to back.
                    int tid = (row * columns + column) * 4;

                    byte blue2  = dev_bitmap2[tid];
                    byte green2 = dev_bitmap2[tid + 1];
                    byte red2   = dev_bitmap2[tid + 2];
                    byte alpha2 = dev_bitmap2[tid + 3];

                    // Absolute per-channel differences (byte operands are promoted to int).
                    int blue_delta = dev_bitmap1[tid] - blue2;
                    if (blue_delta < 0)
                    {
                        blue_delta = -blue_delta;
                    }

                    int green_delta = dev_bitmap1[tid + 1] - green2;
                    if (green_delta < 0)
                    {
                        green_delta = -green_delta;
                    }

                    int red_delta = dev_bitmap1[tid + 2] - red2;
                    if (red_delta < 0)
                    {
                        red_delta = -red_delta;
                    }

                    int alpha_delta = dev_bitmap1[tid + 3] - alpha2;
                    if (alpha_delta < 0)
                    {
                        alpha_delta = -alpha_delta;
                    }

                    if ((red_delta > channelTolerance) || (alpha_delta > channelTolerance) || (green_delta > channelTolerance) || (blue_delta > channelTolerance))
                    {
                        // Record where the pixel is ...
                        possition[positionLength++] = row;
                        possition[positionLength++] = column;

                        // ... and what it looks like in the second image.
                        dev_result[resultLength++] = blue2;
                        dev_result[resultLength++] = green2;
                        dev_result[resultLength++] = red2;
                        dev_result[resultLength++] = alpha2;
                    }
                }
            }

            // Publish the totals once, even when nothing differed, so the host never
            // reads stale values from a previous launch or from uninitialised memory.
            count[0] = resultLength;
            count[1] = positionLength;
        }
Ejemplo n.º 22
0
 /// <summary>
 /// GPU kernel computing <c>c[i] = (a[i] + b[i]) * ConstantMemory[i]</c>, one element per thread.
 /// </summary>
 /// <remarks>
 /// Threads whose global index lies beyond the end of <paramref name="c"/> do nothing, so the
 /// launch grid may be rounded up past the element count. The kernel needs neither shared
 /// memory nor a barrier: every thread touches only its own element.
 /// </remarks>
 /// <param name="thread">Cudafy thread context used to derive the global element index.</param>
 /// <param name="a">First addend; explicitly placed in the global address space.</param>
 /// <param name="b">Second addend.</param>
 /// <param name="c">Output vector.</param>
 public static void VectorAdd(GThread thread,
                         [CudafyAddressSpace(eCudafyAddressSpace.Global)] int[] a,
                         int[] b,
                         int[] c )
 {
     // Global index of the element owned by this thread (1D launch).
     int index = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x;

     // Guard against grids that are larger than the data.
     if (index < c.Length)
     {
         c[index] = (a[index] + b[index]) * ConstantMemory[index];
     }
 }