public static void thekernel(GThread thread, SphereOpenCL[] s, byte[] ptr)
{
    //SphereOpenCL localSphere = s[0];
    SphereOpenCL[] sharedSphere = thread.AllocateShared<SphereOpenCL>("sharedSphere", 16);
    int[] sharedInt = thread.AllocateShared<int>("sharedInt", 16);
    //float somefloat = GMath.Pow(localSphere.b, 2.0F);

    // map from threadIdx/blockIdx to pixel position
    int x = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x;
    int y = thread.threadIdx.y + thread.blockIdx.y * thread.blockDim.y;
    int offset = x + y * thread.blockDim.x * thread.gridDim.x;
    float ox = (x - ray_gui.DIM / 2);
    float oy = (y - ray_gui.DIM / 2);

    float r = 0, g = 0, b = 0;
    float maxz = -INF;
    for (int i = 0; i < SPHERES; i++)
    {
        float n = 0;
        float t = hit(s[i], ox, oy, ref n);
        if (t > maxz)
        {
            float fscale = n;
            r = s[i].r * fscale;
            g = s[i].g * fscale;
            b = s[i].b * fscale;
            maxz = t;
        }
    }

    ptr[offset * 4 + 0] = (byte)(r * 255);
    ptr[offset * 4 + 1] = (byte)(g * 255);
    ptr[offset * 4 + 2] = (byte)(b * 255);
    ptr[offset * 4 + 3] = 255;
}
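// --- Added usage sketch (not from the original source). A minimal host-side launch for
// thekernel above, assuming the usual CUDAfy.NET namespaces (Cudafy, Cudafy.Host,
// Cudafy.Translator) and the ray_gui.DIM / SphereOpenCL definitions the kernel references.
// Grid and block shapes follow the CUDA By Example ray tracer: 16x16 threads per block.
public static byte[] LaunchRayTraceSketch(SphereOpenCL[] spheres) // hypothetical helper name
{
    CudafyModule km = CudafyTranslator.Cudafy();
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    gpu.LoadModule(km);

    SphereOpenCL[] devSpheres = gpu.CopyToDevice(spheres);
    byte[] devPtr = gpu.Allocate<byte>(ray_gui.DIM * ray_gui.DIM * 4); // RGBA, 4 bytes per pixel

    gpu.Launch(new dim3(ray_gui.DIM / 16, ray_gui.DIM / 16), new dim3(16, 16)).thekernel(devSpheres, devPtr);

    byte[] pixels = new byte[ray_gui.DIM * ray_gui.DIM * 4];
    gpu.CopyFromDevice(devPtr, pixels);
    gpu.FreeAll();
    return pixels;
}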
public static void GpuFindPathDistance(GThread thread, int permutations, int cities,
    float[] latitudes, float[] longitudes, AnswerStruct[] answer)
{
    var threadIndex = thread.threadIdx.x;  // thread index within the block
    var blockIndex = thread.blockIdx.x;    // block index within the grid
    var threadsPerBlock = thread.blockDim.x;
    var blocksPerGrid = thread.gridDim.x;
    var threadsPerGrid = threadsPerBlock * blocksPerGrid;
    var permutation = threadIndex + blockIndex * threadsPerBlock;

    var paths = thread.AllocateShared<int>("path", _threadsPerBlock, _cities);
    var bestDistances = thread.AllocateShared<float>("dist", _threadsPerBlock);
    var bestPermutations = thread.AllocateShared<int>("perm", _threadsPerBlock);

    var bestDistance = float.MaxValue;
    var bestPermutation = 0;
    while (permutation < permutations)
    {
        var distance = FindPathDistance(permutations, permutation, cities, latitudes, longitudes, paths, threadIndex);
        if (distance < bestDistance)
        {
            bestDistance = distance;
            bestPermutation = permutation;
        }
        permutation += threadsPerGrid;
    }
    bestDistances[threadIndex] = bestDistance;
    bestPermutations[threadIndex] = bestPermutation;
    thread.SyncThreads();

    // credit: CUDA By Example, page 79:
    // http://www.amazon.com/CUDA-Example-Introduction-General-Purpose-Programming/dp/0131387685
    for (var i = threadsPerBlock / 2; i > 0; i /= 2)
    {
        if (threadIndex < i)
        {
            if (bestDistances[threadIndex] > bestDistances[threadIndex + i])
            {
                bestDistances[threadIndex] = bestDistances[threadIndex + i];
                bestPermutations[threadIndex] = bestPermutations[threadIndex + i];
            }
        }
        thread.SyncThreads();
    }
    if (threadIndex == 0)
    {
        answer[thread.blockIdx.x].distance = bestDistances[0];
        answer[thread.blockIdx.x].pathNo = bestPermutations[0];
    }
}
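// --- Added usage sketch (an assumption, not from the original source). The kernel above leaves
// one best candidate per block in answer[], so the host finishes the reduction across blocks:
private static AnswerStruct ReduceAnswersOnCpuSketch(GPGPU gpu, AnswerStruct[] devAnswer, int blocksPerGrid) // hypothetical helper
{
    var answer = new AnswerStruct[blocksPerGrid];
    gpu.CopyFromDevice(devAnswer, answer);
    var best = answer[0];
    for (int i = 1; i < answer.Length; i++)
    {
        if (answer[i].distance < best.distance)
        {
            best = answer[i];
        }
    }
    return best;
}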
public static void GpuFindPathDistance(GThread thread, long permutations, LatLongStruct[] gpuLatLong, AnswerStruct[] answer)
{
    var threadsPerGrid = thread.blockDim.x * thread.gridDim.x;
    var path = thread.AllocateShared<int>("path", _cities, _threadsPerBlock);
    var bestDistances = thread.AllocateShared<float>("dist", _threadsPerBlock);
    var bestPermutations = thread.AllocateShared<long>("perm", _threadsPerBlock);

    var permutation = (long)(thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x);
    var bestDistance = float.MaxValue;
    var bestPermutation = 0L;
    while (permutation < permutations)
    {
        var distance = FindPathDistance(thread, permutations, permutation, gpuLatLong, path);
        if (distance < bestDistance)
        {
            bestDistance = distance;
            bestPermutation = permutation;
        }
        permutation += threadsPerGrid;
    }
    bestDistances[thread.threadIdx.x] = bestDistance;
    bestPermutations[thread.threadIdx.x] = bestPermutation;
    thread.SyncThreads();

    // credit: CUDA By Example, page 79:
    // http://www.amazon.com/CUDA-Example-Introduction-General-Purpose-Programming/dp/0131387685
    for (int i = thread.blockDim.x / 2; i > 0; i /= 2)
    {
        if (thread.threadIdx.x < i)
        {
            if (bestDistances[thread.threadIdx.x] > bestDistances[thread.threadIdx.x + i])
            {
                bestDistances[thread.threadIdx.x] = bestDistances[thread.threadIdx.x + i];
                bestPermutations[thread.threadIdx.x] = bestPermutations[thread.threadIdx.x + i];
            }
        }
        thread.SyncThreads();
    }
    if (thread.threadIdx.x == 0)
    {
        answer[thread.blockIdx.x].distance = bestDistances[0];
        answer[thread.blockIdx.x].pathNo = bestPermutations[0];
    }
}
public static void Product(GThread thread, int[] a, int[] b, int[] c)
{
    int tid = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x;
    int[] cache = thread.AllocateShared<int>("cache", 4); // size must match threadsPerBlock (this kernel assumes blockDim.x == 4)
    int temp = 0;
    int cacheIndex = thread.threadIdx.x;
    while (tid < N)
    {
        temp = temp + a[tid] * b[tid];
        tid += thread.blockDim.x * thread.gridDim.x;
    }
    cache[cacheIndex] = temp;
    thread.SyncThreads();

    // power-of-2 block reduction of the per-thread partial products
    int i = thread.blockDim.x / 2;
    while (i != 0)
    {
        if (cacheIndex < i)
        {
            cache[cacheIndex] += cache[cacheIndex + i];
        }
        thread.SyncThreads();
        i /= 2;
    }
    if (cacheIndex == 0)
    {
        c[thread.blockIdx.x] = cache[0];
    }
}
public static void histo_kernel(GThread thread, byte[] buffer, int size, uint[] histo)
{
    // clear out the accumulation buffer called temp
    // since we are launched with 256 threads, it is easy
    // to clear that memory with one write per thread
    uint[] temp = thread.AllocateShared<uint>("temp", 256);
    temp[thread.threadIdx.x] = 0;
    thread.SyncThreads();

    // calculate the starting index and the offset to the next
    // block that each thread will be processing
    int i = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x;
    int stride = thread.blockDim.x * thread.gridDim.x;
    while (i < size)
    {
        thread.atomicAdd(ref temp[buffer[i]], 1);
        i += stride;
    }

    // sync the data from the above writes to shared memory,
    // then add the shared memory values to the values from
    // the other thread blocks using global memory atomic adds;
    // same as before, since we have 256 threads, updating the
    // global histogram is just one write per thread!
    thread.SyncThreads();
    thread.atomicAdd(ref histo[thread.threadIdx.x], temp[thread.threadIdx.x]);
}
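// --- Added usage sketch (not from the original source). histo_kernel requires exactly 256
// threads per block (one per byte value). CUDA By Example sizes the grid at twice the GPU's
// multiprocessor count; the fixed block count below is an assumed stand-in for that query.
public static uint[] RunHistogramSketch(GPGPU gpu, byte[] buffer) // hypothetical helper name
{
    byte[] devBuffer = gpu.CopyToDevice(buffer);
    uint[] devHisto = gpu.Allocate<uint>(256);
    gpu.Set(devHisto); // zero the global histogram before accumulation

    int blocks = 32;   // assumed; ideally 2 * multiprocessor count
    gpu.Launch(blocks, 256).histo_kernel(devBuffer, buffer.Length, devHisto);

    uint[] histo = new uint[256];
    gpu.CopyFromDevice(devHisto, histo);
    return histo;
}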
public static void Dot(GThread thread, float[] a, float[] b, float[] c)
{
    float[] cache = thread.AllocateShared<float>("cache", threadsPerBlock);

    int tid = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x;
    int cacheIndex = thread.threadIdx.x;

    float temp = 0;
    while (tid < N)
    {
        temp += a[tid] * b[tid];
        tid += thread.blockDim.x * thread.gridDim.x;
    }

    // set the cache values
    cache[cacheIndex] = temp;

    // synchronize threads in this block
    thread.SyncThreads();

    // for reductions, threadsPerBlock must be a power of 2
    // because of the following code
    int i = thread.blockDim.x / 2;
    while (i != 0)
    {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        thread.SyncThreads();
        i /= 2;
    }

    if (cacheIndex == 0)
        c[thread.blockIdx.x] = cache[0];
}
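// --- Added usage sketch (an assumption, not from the original source). Dot writes one partial
// sum per block into c[], so the final dot product is the host-side sum of those partials;
// threadsPerBlock must be a power of 2 for the in-block reduction above to be correct.
private static float FinishDotOnCpuSketch(GPGPU gpu, float[] devC, int blocksPerGrid) // hypothetical helper
{
    var partials = new float[blocksPerGrid];
    gpu.CopyFromDevice(devC, partials);
    float dotProduct = 0;
    for (int i = 0; i < partials.Length; i++)
    {
        dotProduct += partials[i];
    }
    return dotProduct;
}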
public static void VectorAdd(GThread thread,
    [CudafyAddressSpace(eCudafyAddressSpace.Global)] int[] a, int[] b, int[] c)
{
    int[] shared = thread.AllocateShared<int>("shared", Program.N); // allocated but unused in this kernel
    int index = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x;
    //int index = thread.get_local_id(0);
    c[index] = (a[index] + b[index]) * ConstantMemory[index];
    thread.SyncThreads();
}
public static void copy2D(GThread thread, int[] result)
{
    int[,] cache = thread.AllocateShared<int>("cache", XSIZE, YSIZE);
    int x = thread.blockIdx.x;
    int y = 0;
    while (y < YSIZE)
    {
        cache[x, y] = Constant2D[x, y] * Constant2D.Rank;
        result[x * YSIZE + y] = cache[x, y];
        y++;
    }
}
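// --- Added usage sketch (an assumption, not from the original source). copy2D reads the
// constant array Constant2D, which the host must populate via CopyToConstantMemory before
// launching one single-thread block per row (XSIZE blocks):
public static int[] RunCopy2DSketch(GPGPU gpu, int[,] hostData) // hypothetical helper name
{
    gpu.CopyToConstantMemory(hostData, Constant2D);
    int[] devResult = gpu.Allocate<int>(XSIZE * YSIZE);
    gpu.Launch(XSIZE, 1).copy2D(devResult);

    int[] result = new int[XSIZE * YSIZE];
    gpu.CopyFromDevice(devResult, result);
    return result;
}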
public static void dot(GThread thread, float[] a, float[] b, float[] c)
{
    float[] cache = thread.AllocateShared<float>("cache", threadsPerBlock);

    int tid = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x;
    int cacheIndex = thread.threadIdx.x;

    float temp = 0;
    while (tid < N)
    {
        temp += a[tid] * b[tid];
        tid += thread.blockDim.x * thread.gridDim.x;
    }

    // set the cache values
    cache[cacheIndex] = temp;

    // synchronize threads in this block
    thread.SyncThreads();

    // for reductions, threadsPerBlock must be a power of 2
    // because of the following code
    int i = thread.blockDim.x / 2;
    while (i != 0)
    {
        if (cacheIndex < i)
        {
            cache[cacheIndex] += cache[cacheIndex + i];
        }
        thread.SyncThreads();
        i /= 2;
    }

    if (cacheIndex == 0)
    {
        c[thread.blockIdx.x] = cache[0];
    }
    //callWithShared(cache);
}
public static void GpuFindPathDistance(GThread thread, AnswerStruct[] answer)
{
    var answerLocal = thread.AllocateShared<AnswerStruct>("ansL", ThreadsPerBlock);

    var bestDistance = thread.gridDim.x;
    var bestPermutation = thread.blockDim.x;
    var sum = 0;
    for (int i = 0; i < thread.blockDim.x; i++)
    {
        sum += i * thread.threadIdx.x;
    }

    answerLocal[thread.threadIdx.x].distance = bestDistance;
    answerLocal[thread.threadIdx.x].pathNo = bestPermutation;
    thread.SyncThreads();

    if (thread.threadIdx.x == 0)
    {
        answer[thread.blockIdx.x] = answerLocal[0];
    }
}
public static void ComputeFitsKernel(GThread gThread, int edgeIndexA, int edgeIndexB, float[,] fit)
{
    var sum = gThread.AllocateShared<float>("sum", 64);

    var tileIndexA = gThread.blockIdx.x;
    var tileIndexB = gThread.blockIdx.y;
    var pixelIndex = gThread.threadIdx.x;

    var diff = Edges[tileIndexA, edgeIndexA, pixelIndex] - Edges[tileIndexB, edgeIndexB, pixelIndex];
    sum[pixelIndex] = diff * diff;
    gThread.SyncThreads();

    for (var i = 64 / 2; i > 0; i /= 2)
    {
        if (pixelIndex < i)
        {
            sum[pixelIndex] += sum[pixelIndex + i];
        }
        gThread.SyncThreads();
    }

    if (pixelIndex == 0)
    {
        fit[tileIndexA, tileIndexB] = sum[0];
    }
}
public static void GenerateRipples(GThread thread, int[] results)
{
    var threadId = thread.blockIdx.y * thread.gridDim.x +    // offset to grid
                   thread.blockIdx.x +                       // index to grid
                   thread.threadIdx.y * thread.blockDim.x +  // offset to block in grid
                   thread.threadIdx.x;                       // index to block

    var shared = thread.AllocateShared<int>("sharedarray", 1024); // so i can breakpoint here
    shared[0] = 0;

    var threadPosInBlockX = 0;
    threadPosInBlockX = thread.threadIdx.x;
    var threadPosInBlockY = 0;
    threadPosInBlockY = thread.threadIdx.y;
    var blockPosInGridX = 0;
    blockPosInGridX = thread.blockIdx.x;
    var blockPosInGridY = 0;
    blockPosInGridY = thread.blockIdx.y;
    var gridSizeX = 0;
    gridSizeX = thread.gridDim.x;
    var gridSizeY = 0;
    gridSizeY = thread.gridDim.y;
    var blockSizeX = 0;
    blockSizeX = thread.blockDim.x;
    var blockSizeY = 0;
    blockSizeY = thread.blockDim.y;

    var threadX = blockSizeX * blockPosInGridX + threadPosInBlockX;

    //if i use only one variable, everything is fine:
    var threadY = blockSizeY; // this calculates just fine
    threadY = blockSizeY * blockPosInGridY + threadPosInBlockY;

    // hint: use NSight for Visual Studio and look at the NSight output,
    // it reports access violations and tells you where...

    // if our threadId is within bounds of array size;
    // we cause an access violation if not
    if (threadId < results.Length)
    {
        results[threadId] = threadX + threadY;
    }
}
public static void RepeatedAverageColorDifferenceKernel(GThread thread,
    byte[] source, int sourceStride, int sourceWidth, int sourceHeight,
    byte[] patterns, int patternsStride, int patternsWidth, int patternsHeight, int patternsOffsetX,
    int patternStride, int patternWidth, int patternHeight,
    int[,] patternLocations, int[,,] blockAvgs)
{
    var threadX = thread.threadIdx.x;
    var threadY = thread.threadIdx.y;
    int x = thread.blockIdx.x * thread.blockDim.x + threadX;
    int y = thread.blockIdx.y * thread.blockDim.y + threadY;

    //Store px sums in shared memory
    var pixelAvgDistance = thread.AllocateShared<float>("pixelSums", (int)Filters.BlockSideLength, (int)Filters.BlockSideLength);
    pixelAvgDistance[threadX, threadY] = 0;

    var patternId = x / patternWidth;
    var sourceStartX = patternLocations[patternId, 0];
    var sourceStartY = patternLocations[patternId, 1];

    if (x < patternsWidth && y < patternsHeight)
    {
        var patternIndex = 3 * (x + patternsOffsetX) + y * patternsStride;
        var sourceIndex = 3 * ((x % patternWidth) + sourceStartX) + (y + sourceStartY) * sourceStride;
        pixelAvgDistance[threadX, threadY] = ComputePixelError(patterns, patternIndex, source, sourceIndex);
        //Paint pattern blue with the overlap average
        //patterns[patternIndex] = (byte)pixelAvgDistance[threadX, threadY];
    }

    //Wait till all block threads have finished
    thread.SyncThreads();

    //Use the first thread of each block to add up the block sums. CPU will need to add up the grid sum.
    if (threadX == 0 && threadY == 0)
    {
        float thisBlockAvg = 0;
        float nextBlockAvg = 0;

        //Add up the pixel sums to a block sum
        for (var ty = 0; ty < (int)Filters.BlockSideLength; ty++)
        {
            for (var tx = 0; tx < (int)Filters.BlockSideLength; tx++)
            {
                if ((x + tx) / patternWidth == x / patternWidth)
                {
                    thisBlockAvg += pixelAvgDistance[tx, ty];
                }
                else
                {
                    nextBlockAvg += pixelAvgDistance[tx, ty];
                }
            }
        }

        // cast before dividing: integer division would truncate and defeat the Ceiling
        var blocksPerPattern = (int)GMath.Ceiling((float)patternWidth / thread.blockDim.x);

        //Store the block's avgs
        if (thisBlockAvg > 0)
        {
            blockAvgs[patternId, thread.blockIdx.x % blocksPerPattern, thread.blockIdx.y] = (int)GMath.Round(thisBlockAvg);
        }
        if (nextBlockAvg > 0)
        {
            //Last block of this pattern is the first block of next pattern
            var isLastPattern = (patternsWidth / patternWidth) - 1 == patternId;
            if (!isLastPattern)
            {
                blockAvgs[patternId + 1, 0, thread.blockIdx.y] = (int)GMath.Round(nextBlockAvg);
            }
        }
    }
}
public static void AverageColorDifferenceKernel(GThread thread,
    byte[] source, int sourceStride, int sourceWidth, int sourceHeight,
    byte[] pattern, int patternStride, int patternWidth, int patternHeight,
    int sourceStartX, int sourceStartY, int[,] blockSums)
{
    //Store px sums in shared memory
    var pixelSums = thread.AllocateShared<int>("pixelSums", (int)Filters.BlockSideLength, (int)Filters.BlockSideLength);
    pixelSums[thread.threadIdx.x, thread.threadIdx.y] = 0;

    int x = thread.blockIdx.x * thread.blockDim.x + thread.threadIdx.x;
    int y = thread.blockIdx.y * thread.blockDim.y + thread.threadIdx.y;

    if (x < patternWidth && y < patternHeight)
    {
        var sourceIndex = 3 * (x + sourceStartX) + (y + sourceStartY) * sourceStride;
        var patternIndex = 3 * x + y * patternStride;

        //Ignore if close to black
        if (pattern[patternIndex] > 3)
        {
            pixelSums[thread.threadIdx.x, thread.threadIdx.y] =
                Distance(pattern[patternIndex], source[sourceIndex]) +
                Distance(pattern[patternIndex + 1], source[sourceIndex + 1]) +
                Distance(pattern[patternIndex + 2], source[sourceIndex + 2]);
        }
    }

    //Wait till all block threads have finished
    thread.SyncThreads();

    //Use the first thread of each block to add up the block sums. CPU will need to add up the grid sum.
    if (thread.threadIdx.x == 0 && thread.threadIdx.y == 0)
    {
        int blockSum = 0;

        //Add up the pixel sums to a block sum
        for (var ty = 0; ty < (int)Filters.BlockSideLength; ty++)
        {
            for (var tx = 0; tx < (int)Filters.BlockSideLength; tx++)
            {
                blockSum += pixelSums[tx, ty];
            }
        }

        //Store the block's sum
        blockSums[thread.blockIdx.x, thread.blockIdx.y] = blockSum;
    }

    //An alternative shared-memory reduction, left commented out:
    //var window = (int)Filters.BlockSideLength / 2;
    //while (window > 0)
    //{
    //    if (thread.threadIdx.x < window)
    //        pixelSums[thread.threadIdx.x, thread.threadIdx.y] += pixelSums[thread.threadIdx.x + window, thread.threadIdx.y];
    //    window /= 2;
    //    thread.SyncThreads();
    //}
    //if (thread.threadIdx.x == 0 && thread.threadIdx.y == 0)
    //{
    //    int blockSum = 0;
    //    for (var ty = 0; ty < (int)Filters.BlockSideLength; ty++)
    //        blockSum += pixelSums[0, ty];
    //    //Store the block's sum
    //    blockSums[thread.blockIdx.x, thread.blockIdx.y] = blockSum;
    //}
}
public static void ExplorePermutationsKernel(GThread gThread, Evaluation[] evaluations)
{
    var blockEvaluations = gThread.AllocateShared<Evaluation>("be", 256);
    var v = gThread.AllocateShared<byte>("v", 256, 9);

    var t = gThread.threadIdx.x;
    var permutation = gThread.blockIdx.x * gThread.blockDim.x + gThread.threadIdx.x;

    // tile layout:
    // 0 1 2
    // 3 4 5
    // 6 7 8
    TileOrderFromPermutation(Permutations, permutation, 9, v, t);

    var metric = 0f;
    metric += LeftRightFit[v[t, 0], v[t, 1]] + LeftRightFit[v[t, 1], v[t, 2]];
    metric += LeftRightFit[v[t, 3], v[t, 4]] + LeftRightFit[v[t, 4], v[t, 5]];
    metric += LeftRightFit[v[t, 6], v[t, 7]] + LeftRightFit[v[t, 7], v[t, 8]];
    metric += TopBottomFit[v[t, 0], v[t, 3]] + TopBottomFit[v[t, 3], v[t, 6]];
    metric += TopBottomFit[v[t, 1], v[t, 4]] + TopBottomFit[v[t, 4], v[t, 7]];
    metric += TopBottomFit[v[t, 2], v[t, 5]] + TopBottomFit[v[t, 5], v[t, 8]];

    blockEvaluations[t].Permutation = permutation;
    blockEvaluations[t].Metric = metric;
    gThread.SyncThreads();

    for (var i = 256 / 2; i > 0; i /= 2)
    {
        if (t < i)
        {
            if (blockEvaluations[t].Metric > blockEvaluations[t + i].Metric)
            {
                blockEvaluations[t] = blockEvaluations[t + i];
            }
        }
        gThread.SyncThreads();
    }

    if (gThread.threadIdx.x == 0)
    {
        evaluations[gThread.blockIdx.x] = blockEvaluations[0];
    }
}
private static void GpuConv2DKernelsGradient(GThread thread, float[] input, float[] gradient,
    float[,] resultPartials, GpuShape[] shapes, int paddingX, int paddingY, int stride)
{
    /* CPU reference implementation this kernel parallelizes:
     * for (int kernelD = 0; kernelD < kernels.Depth; ++kernelD)
     * for (int kernelH = 0; kernelH < kernels.Height; ++kernelH)
     * for (int kernelW = 0; kernelW < kernels.Width; ++kernelW)
     * for (int kernelN = 0; kernelN < kernels.BatchSize; ++kernelN)
     * {
     *     for (int n = 0; n < gradient.BatchSize; ++n)
     *     for (int h = -paddingY, outH = 0; outH < gradient.Height; h += stride, ++outH)
     *     for (int w = -paddingX, outW = 0; outW < gradient.Width; w += stride, ++outW)
     *     {
     *         float grad = gradient[outW, outH, kernelN, n];
     *         float kernGradVal = input.TryGet(0, w + kernelW, h + kernelH, kernelD, n) * grad;
     *         kernelsGradient[kernelW, kernelH, kernelD, kernelN] += kernGradVal;
     *     }
     * }
     */

    // this shared memory will store partial sums that later on will be reduced
    float[] sdata = thread.AllocateShared<float>("sdata", THREADS_PER_BLOCK);

    int resultElemId = thread.blockIdx.x;
    int tid = thread.threadIdx.x;
    int id = (thread.blockDim.x * thread.blockIdx.y) + thread.threadIdx.x;
    int threadsRequiredPerResultElem = shapes[4].BatchSize * shapes[4].Height * shapes[4].Width;

    int kernelN = shapes[1].GetBatch(resultElemId);
    int kernelD = shapes[1].GetDepth(resultElemId);
    int kernelH = shapes[1].GetHeight(resultElemId);
    int kernelW = shapes[1].GetWidth(resultElemId);

    int n = shapes[4].GetBatch(id);
    int outH = shapes[4].GetHeight(id);
    int outW = shapes[4].GetWidth(id);
    int h = -paddingY + stride * outH;
    int w = -paddingX + stride * outW;

    float temp = 0;
    if (id < threadsRequiredPerResultElem)
    {
        int inputIndex = shapes[0].TryGetIndex(w + kernelW, h + kernelH, kernelD, n);
        if (inputIndex >= 0)
        {
            temp = input[inputIndex] * gradient[shapes[2].GetIndex(outW, outH, kernelN, n)];
        }
        //if (resultElemId == 0)
        //    Console.WriteLine("tid=%d - %f", id, temp);
    }
    sdata[tid] = temp;
    thread.SyncThreads();

    int i = thread.blockDim.x / 2;
    while (i != 0)
    {
        if (tid < i)
        {
            sdata[tid] += sdata[tid + i];
        }
        thread.SyncThreads();
        i /= 2;
    }

    if (tid == 0)
    {
        //if (resultElemId == 0)
        //    Console.WriteLine("gridDim.x=%d gridDim.y=%d blockDim.x=%d blockDim.y=%d", thread.gridDim.x, thread.gridDim.y, thread.blockDim.x, thread.blockDim.y);
        resultPartials[thread.blockIdx.x, thread.blockIdx.y] = sdata[0];
    }
}
private static void GpuConv2DInputGradient(GThread thread, float[] gradient, float[] rotKernels,
    float[,] resultPartials, GpuShape[] shapes, int paddingX, int paddingY, int stride)
{
    /* CPU reference implementation this kernel parallelizes:
     * for (int n = 0; n < gradients.BatchSize; ++n)
     * for (int outW = 0, w = -paddingX; outW < inputGradients.Width; w += stride, ++outW)
     * for (int outH = 0, h = -paddingY; outH < inputGradients.Height; h += stride, ++outH)
     * for (int outD = 0; outD < inputGradients.Depth; ++outD)
     * {
     *     for (int kernelN = 0; kernelN < rotKernels.BatchSize; ++kernelN)
     *     for (int kernelH = 0; kernelH < rotKernels.Height; ++kernelH)
     *     for (int kernelW = 0; kernelW < rotKernels.Width; ++kernelW)
     *         inputGradients[outW, outH, outD, n] += gradients.TryGet(0, w + kernelW, h + kernelH, kernelN, n) * rotKernels[kernelW, kernelH, outD, kernelN];
     * }
     */

    // this shared memory will store partial sums that later on will be reduced
    float[] sdata = thread.AllocateShared<float>("sdata", THREADS_PER_BLOCK);

    int resultElemId = thread.blockIdx.x;
    int tid = thread.threadIdx.x;
    int id = (thread.blockDim.x * thread.blockIdx.y) + thread.threadIdx.x;
    int threadsRequiredPerResultElem = shapes[1].BatchSize * shapes[1].Height * shapes[1].Width;

    int outN = shapes[2].GetBatch(resultElemId);
    int outD = shapes[2].GetDepth(resultElemId);
    int outH = shapes[2].GetHeight(resultElemId);
    int outW = shapes[2].GetWidth(resultElemId);

    int kernelN = shapes[3].GetBatch(id);
    int kernelH = shapes[3].GetHeight(id);
    int kernelW = shapes[3].GetWidth(id);
    int h = -paddingY + stride * outH;
    int w = -paddingX + stride * outW;

    float temp = 0;
    if (id < threadsRequiredPerResultElem)
    {
        int gradientIndex = shapes[0].TryGetIndex(w + kernelW, h + kernelH, kernelN, outN);
        if (gradientIndex >= 0)
        {
            temp = gradient[gradientIndex] * rotKernels[shapes[1].GetIndex(kernelW, kernelH, outD, kernelN)];
        }
    }
    sdata[tid] = temp;
    thread.SyncThreads();

    int i = thread.blockDim.x / 2;
    while (i != 0)
    {
        if (tid < i)
        {
            sdata[tid] += sdata[tid + i];
        }
        thread.SyncThreads();
        i /= 2;
    }

    if (tid == 0)
    {
        resultPartials[thread.blockIdx.x, thread.blockIdx.y] = sdata[0];
    }
}
//CPU launches kernels on GPU to process the data
public static void calGPU(GThread thread, byte[] dev_bitmap1, byte[] dev_bitmap2, byte[] dev_result,
    int[] imageWidth, int[] count, int[] possition)
{
    // NOTE: the indices derived from thread/block coordinates are immediately overwritten by
    // the serial loops below, so this kernel only does useful work when launched as a single thread.
    int i = (thread.blockIdx.x * thread.blockDim.x) + thread.threadIdx.x;
    int j = (thread.blockIdx.y * thread.blockDim.y) + thread.threadIdx.y;

    int[] sharedCount = thread.AllocateShared<int>("count1", 2);
    sharedCount[0] = 0; // write cursor into dev_result
    sharedCount[1] = 0; // write cursor into possition
    int tid = 0;
    int alpha_delta, red_delta, green_delta, blue_delta;

    // walk every pixel (BGRA layout, 4 bytes each) and record where the two bitmaps differ
    for (i = 0; i < imageWidth[1]; i += 1)
    {
        for (j = 0; j < imageWidth[0]; j += 1)
        {
            tid = (i * imageWidth[0] + j) * 4;

            PixelData pixelColor1 = new PixelData();
            PixelData pixelColor2 = new PixelData();
            pixelColor1.blue = dev_bitmap1[tid];
            pixelColor1.green = dev_bitmap1[tid + 1];
            pixelColor1.red = dev_bitmap1[tid + 2];
            pixelColor1.alpha = dev_bitmap1[tid + 3];
            pixelColor2.blue = dev_bitmap2[tid];
            pixelColor2.green = dev_bitmap2[tid + 1];
            pixelColor2.red = dev_bitmap2[tid + 2];
            pixelColor2.alpha = dev_bitmap2[tid + 3];

            // absolute per-channel differences
            if (pixelColor1.red > pixelColor2.red) { red_delta = pixelColor1.red - pixelColor2.red; }
            else { red_delta = pixelColor2.red - pixelColor1.red; }
            if (pixelColor1.alpha > pixelColor2.alpha) { alpha_delta = pixelColor1.alpha - pixelColor2.alpha; }
            else { alpha_delta = pixelColor2.alpha - pixelColor1.alpha; }
            if (pixelColor1.green > pixelColor2.green) { green_delta = pixelColor1.green - pixelColor2.green; }
            else { green_delta = pixelColor2.green - pixelColor1.green; }
            if (pixelColor1.blue > pixelColor2.blue) { blue_delta = pixelColor1.blue - pixelColor2.blue; }
            else { blue_delta = pixelColor2.blue - pixelColor1.blue; }

            // any channel differing by more than 8 counts as a changed pixel:
            // record its coordinates and the new pixel value, and publish the counters
            if ((red_delta > 8) || (alpha_delta > 8) || (green_delta > 8) || (blue_delta > 8))
            {
                possition[sharedCount[1]++] = i;
                possition[sharedCount[1]++] = j;
                dev_result[sharedCount[0]++] = pixelColor2.blue;
                dev_result[sharedCount[0]++] = pixelColor2.green;
                dev_result[sharedCount[0]++] = pixelColor2.red;
                dev_result[sharedCount[0]++] = pixelColor2.alpha;
                count[1] = sharedCount[1];
                count[0] = sharedCount[0];
            }
        }
    }
}