/// <summary>
/// Accumulates a 256-bin histogram of the first <paramref name="size"/> bytes of
/// <paramref name="buffer"/> into <paramref name="histo"/>.
/// Each block counts into a shared 256-entry buffer and then merges that buffer
/// into the global histogram with atomic adds.
/// </summary>
/// <param name="thread">Cudafy thread context.</param>
/// <param name="buffer">Input bytes.</param>
/// <param name="size">Number of bytes of <paramref name="buffer"/> to process.</param>
/// <param name="histo">Global histogram; indices 0..255 are added to (not cleared here).</param>
public static void histo_kernel(GThread thread, byte[] buffer, int size, uint[] histo)
{
    // Per-block accumulation buffer: one counter per possible byte value.
    uint[] temp = thread.AllocateShared<uint>("temp", 256);

    // Clear the buffer with a block-strided loop. The original wrote only
    // temp[threadIdx.x], which is correct only when the block has exactly
    // 256 threads; this form is identical for 256 threads and also correct
    // for any other block width.
    for (int bin = thread.threadIdx.x; bin < 256; bin += thread.blockDim.x)
    {
        temp[bin] = 0;
    }
    thread.SyncThreads();

    // Starting index for this thread and the offset to its next element.
    int i = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x;
    int stride = thread.blockDim.x * thread.gridDim.x;
    while (i < size)
    {
        thread.atomicAdd(ref temp[buffer[i]], 1);
        i += stride;
    }

    // Wait until every thread of the block has finished counting, then merge
    // the block-local counts into the global histogram.
    thread.SyncThreads();
    for (int bin = thread.threadIdx.x; bin < 256; bin += thread.blockDim.x)
    {
        thread.atomicAdd(ref histo[bin], temp[bin]);
    }
}
/// <summary>
/// Byte histogram kernel: counts the values of the first <paramref name="size"/>
/// entries of <paramref name="buffer"/> into a shared 256-entry buffer per block,
/// then adds that buffer to <paramref name="histo"/> atomically.
/// </summary>
/// <param name="thread">Cudafy thread context.</param>
/// <param name="buffer">Input bytes.</param>
/// <param name="size">Number of bytes of <paramref name="buffer"/> to process.</param>
/// <param name="histo">Global histogram; indices 0..255 are added to.</param>
public static void histo_kernel(GThread thread, byte[] buffer, int size, uint[] histo)
{
    uint[] temp = thread.AllocateShared<uint>("temp", 256);
    int lane = thread.threadIdx.x;
    int blockWidth = thread.blockDim.x;

    // Zero all 256 shared counters. The original zeroed only temp[threadIdx.x],
    // leaving bins uninitialised for blocks narrower than 256 threads and
    // indexing past the buffer for wider ones.
    int clearBin = lane;
    while (clearBin < 256)
    {
        temp[clearBin] = 0;
        clearBin += blockWidth;
    }
    thread.SyncThreads();

    // Each thread walks the input starting at its global index and advancing
    // by the total number of launched threads.
    int stride = blockWidth * thread.gridDim.x;
    for (int i = lane + thread.blockIdx.x * blockWidth; i < size; i += stride)
    {
        thread.atomicAdd(ref temp[buffer[i]], 1);
    }

    // All shared-memory counts must be complete before they are merged.
    thread.SyncThreads();
    int mergeBin = lane;
    while (mergeBin < 256)
    {
        thread.atomicAdd(ref histo[mergeBin], temp[mergeBin]);
        mergeBin += blockWidth;
    }
}
/// <summary>
/// For each of <c>Utils.LAYER_SIZE</c> layers, computes
/// <c>c[item] = sum_j b[layer, item, j] * a[j]</c> for this thread's chunk of
/// <c>Utils.CHUNK_SIZE</c> consecutive items, then copies the chunk from
/// <paramref name="c"/> back into <paramref name="a"/>.
/// </summary>
/// <param name="thread">Cudafy thread context.</param>
/// <param name="a">Input vector; overwritten with each layer's result.</param>
/// <param name="b">Coefficients indexed [layer, item, j].</param>
/// <param name="c">Scratch/output vector for the current layer.</param>
public static void CalculateNeuralNetwork(GThread thread, float[] a, float[,,] b, float[] c)
{
    // First item owned by this thread; each thread owns CHUNK_SIZE consecutive items.
    int firstItem = (thread.blockIdx.x * Utils.BLOCK_SIZE + thread.threadIdx.x) * Utils.CHUNK_SIZE;
    int pastLastItem = firstItem + Utils.CHUNK_SIZE;

    for (int layer = 0; layer < Utils.LAYER_SIZE; layer++)
    {
        // Weighted sums for this thread's items.
        for (int item = firstItem; item < pastLastItem; item++)
        {
            float acc = 0;
            for (int j = 0; j < Utils.N; j++)
            {
                acc += b[layer, item, j] * a[j];
            }
            c[item] = acc;
        }

        // NOTE(review): this barrier is issued through the block-level SyncThreads
        // call; whether items owned by other blocks are finished before a[] is
        // overwritten below cannot be determined from this kernel alone.
        thread.SyncThreads();

        // Publish the layer result as the next layer's input.
        for (int item = firstItem; item < pastLastItem; item++)
        {
            a[item] = c[item];
        }

        thread.SyncThreads();
    }
}
/// <summary>
/// Partial dot product of <paramref name="a"/> and <paramref name="b"/> over the
/// first <c>N</c> elements. Each block reduces its threads' partial sums in shared
/// memory and writes one value to <c>c[blockIdx.x]</c>.
/// </summary>
/// <param name="thread">Cudafy thread context.</param>
/// <param name="a">First operand.</param>
/// <param name="b">Second operand.</param>
/// <param name="c">Receives one partial sum per block.</param>
public static void Dot(GThread thread, float[] a, float[] b, float[] c)
{
    float[] cache = thread.AllocateShared<float>("cache", threadsPerBlock);
    int lane = thread.threadIdx.x;
    int gridSpan = thread.blockDim.x * thread.gridDim.x;

    // Per-thread partial sum over elements lane, lane + gridSpan, ...
    float partial = 0;
    for (int idx = lane + thread.blockIdx.x * thread.blockDim.x; idx < N; idx += gridSpan)
    {
        partial += a[idx] * b[idx];
    }
    cache[lane] = partial;

    // Every slot must be written before the reduction reads it.
    thread.SyncThreads();

    // Pairwise reduction: halves the active range each pass, so the block
    // width must be a power of two.
    for (int half = thread.blockDim.x / 2; half != 0; half /= 2)
    {
        if (lane < half)
        {
            cache[lane] += cache[lane + half];
        }
        thread.SyncThreads();
    }

    if (lane == 0)
    {
        c[thread.blockIdx.x] = cache[0];
    }
}
/// <summary>
/// Evaluates path permutations with <c>FindPathDistance</c>, keeps the shortest
/// one seen by each thread, reduces to the shortest per block, and stores that
/// block's result in <c>answer[blockIdx.x]</c>.
/// </summary>
/// <param name="thread">Cudafy thread context.</param>
/// <param name="permutations">Total number of permutations to evaluate.</param>
/// <param name="cities">Passed through to <c>FindPathDistance</c>.</param>
/// <param name="latitudes">Passed through to <c>FindPathDistance</c>.</param>
/// <param name="longitudes">Passed through to <c>FindPathDistance</c>.</param>
/// <param name="answer">Receives one (distance, pathNo) entry per block.</param>
public static void GpuFindPathDistance(GThread thread, int permutations, int cities, float[] latitudes, float[] longitudes, AnswerStruct[] answer)
{
    var lane = thread.threadIdx.x;
    var blockWidth = thread.blockDim.x;
    var gridWidth = blockWidth * thread.gridDim.x;

    var paths = thread.AllocateShared<int>("path", _threadsPerBlock, _cities);
    var bestDistances = thread.AllocateShared<float>("dist", _threadsPerBlock);
    var bestPermutations = thread.AllocateShared<int>("perm", _threadsPerBlock);

    // Each thread scans permutations lane + block offset, advancing by the
    // total number of launched threads.
    var localBestDistance = float.MaxValue;
    var localBestPermutation = 0;
    for (var candidate = lane + thread.blockIdx.x * blockWidth; candidate < permutations; candidate += gridWidth)
    {
        var distance = FindPathDistance(permutations, candidate, cities, latitudes, longitudes, paths, lane);
        if (distance < localBestDistance)
        {
            localBestDistance = distance;
            localBestPermutation = candidate;
        }
    }

    bestDistances[lane] = localBestDistance;
    bestPermutations[lane] = localBestPermutation;
    thread.SyncThreads();

    // Min-reduction over the block (pattern credited in the original to
    // "CUDA By Example", page 79).
    for (var half = blockWidth / 2; half > 0; half /= 2)
    {
        if (lane < half && bestDistances[lane + half] < bestDistances[lane])
        {
            bestDistances[lane] = bestDistances[lane + half];
            bestPermutations[lane] = bestPermutations[lane + half];
        }
        thread.SyncThreads();
    }

    if (lane == 0)
    {
        answer[thread.blockIdx.x].distance = bestDistances[0];
        answer[thread.blockIdx.x].pathNo = bestPermutations[0];
    }
}
/// <summary>
/// Fills <paramref name="differencesArray"/> with absolute differences between
/// peak masses and in-silico masses, then flags entries below
/// <c>Tolerance[0]</c> in <paramref name="MatchesFound"/>.
/// Layout of <paramref name="differencesArray"/>: left differences first
/// (left.Length * peaks.Length entries), then right differences, then
/// modified-ion differences.
/// </summary>
/// <remarks>
/// Changes from the original: the peak is read only after the thread index has
/// been range-checked, the barrier that sat inside a conditional branch was
/// removed, empty arrays no longer cause a modulo by zero, and the final
/// compare/write is guarded against both array lengths.
/// </remarks>
/// <param name="thread">Cudafy thread context.</param>
/// <param name="LeftInsilicoMasses">Left in-silico masses.</param>
/// <param name="RightInsilicoMasses">Right in-silico masses.</param>
/// <param name="peakListMasses">Peak masses.</param>
/// <param name="differencesArray">Output differences (see layout above).</param>
/// <param name="MatchesFound">Output flags: 1 when the difference at the same index is below tolerance, else 0.</param>
/// <param name="arraySize">Unused by this kernel.</param>
/// <param name="numberOfMatches">Unused by this kernel.</param>
/// <param name="Tolerance">Element 0 is the match tolerance.</param>
/// <param name="modifiedIonsArray">Modified ion masses.</param>
public static void PeaksCompare_Da(GThread thread, double[] LeftInsilicoMasses, double[] RightInsilicoMasses, double[] peakListMasses, double[] differencesArray, int[] MatchesFound, int[] arraySize, int[] numberOfMatches, double[] Tolerance, double[] modifiedIonsArray)
{
    int tidx = thread.blockIdx.x * thread.blockDim.x + thread.threadIdx.x;

    int leftCount = LeftInsilicoMasses.Length;
    int rightCount = RightInsilicoMasses.Length;
    int modifiedCount = modifiedIonsArray.Length;
    int peakCount = peakListMasses.Length;

    int leftSpan = leftCount * peakCount;
    int rightOffset = leftSpan;
    int modifiedOffset = leftSpan + (rightCount * peakCount);

    if (tidx < leftSpan)
    {
        // tidx < leftCount * peakCount guarantees leftCount > 0 and
        // tidx / leftCount < peakCount, so both reads are in range.
        double peak = peakListMasses[tidx / leftCount];

        double leftDiff = LeftInsilicoMasses[tidx % leftCount] - peak;
        if (leftDiff < 0)
        {
            leftDiff = leftDiff * -1;
        }
        differencesArray[tidx] = leftDiff;

        if (rightCount > 0)
        {
            double rightDiff = RightInsilicoMasses[tidx % rightCount] - peak;
            if (rightDiff < 0)
            {
                rightDiff = rightDiff * -1;
            }
            differencesArray[tidx + rightOffset] = rightDiff;
        }
    }

    // NOTE(review): as in the original, the peak for the modified-ion section is
    // selected with tidx / LeftInsilicoMasses.Length rather than
    // tidx / modifiedIonsArray.Length; confirm this is intended when the two
    // lengths differ. The extra range checks only skip reads that were out of range.
    if (tidx < modifiedCount * peakCount && leftCount > 0 && tidx / leftCount < peakCount)
    {
        double modifiedDiff = modifiedIonsArray[tidx % modifiedCount] - peakListMasses[tidx / leftCount];
        if (modifiedDiff < 0)
        {
            modifiedDiff = modifiedDiff * -1;
        }
        differencesArray[tidx + modifiedOffset] = modifiedDiff;
    }

    // Reached by every thread of the block (it is outside all conditionals).
    thread.SyncThreads();

    if (tidx < differencesArray.Length && tidx < MatchesFound.Length)
    {
        if (differencesArray[tidx] < Tolerance[0])
        {
            MatchesFound[tidx] = 1;
        }
        else
        {
            MatchesFound[tidx] = 0;
        }
    }
}
/// <summary>
/// Evaluates path permutations (64-bit permutation numbers) with
/// <c>FindPathDistance</c>, reduces to the shortest distance per block and
/// stores it, with its permutation number, in <c>answer[blockIdx.x]</c>.
/// </summary>
/// <param name="thread">Cudafy thread context.</param>
/// <param name="permutations">Total number of permutations to evaluate.</param>
/// <param name="gpuLatLong">Passed through to <c>FindPathDistance</c>.</param>
/// <param name="answer">Receives one (distance, pathNo) entry per block.</param>
public static void GpuFindPathDistance(GThread thread, long permutations, LatLongStruct[] gpuLatLong, AnswerStruct[] answer)
{
    var lane = thread.threadIdx.x;
    var gridWidth = thread.blockDim.x * thread.gridDim.x;

    var path = thread.AllocateShared<int>("path", _cities, _threadsPerBlock);
    var bestDistances = thread.AllocateShared<float>("dist", _threadsPerBlock);
    var bestPermutations = thread.AllocateShared<long>("perm", _threadsPerBlock);

    // Thread-local best over permutations starting at the thread's global
    // index and advancing by the total number of launched threads.
    var localBestDistance = float.MaxValue;
    var localBestPermutation = 0L;
    for (var candidate = (long)(lane + thread.blockIdx.x * thread.blockDim.x); candidate < permutations; candidate += gridWidth)
    {
        var distance = FindPathDistance(thread, permutations, candidate, gpuLatLong, path);
        if (distance < localBestDistance)
        {
            localBestDistance = distance;
            localBestPermutation = candidate;
        }
    }

    bestDistances[lane] = localBestDistance;
    bestPermutations[lane] = localBestPermutation;
    thread.SyncThreads();

    // Min-reduction over the block (pattern credited in the original to
    // "CUDA By Example", page 79).
    for (int half = thread.blockDim.x / 2; half > 0; half /= 2)
    {
        if (lane < half && bestDistances[lane + half] < bestDistances[lane])
        {
            bestDistances[lane] = bestDistances[lane + half];
            bestPermutations[lane] = bestPermutations[lane + half];
        }
        thread.SyncThreads();
    }

    if (lane == 0)
    {
        answer[thread.blockIdx.x].distance = bestDistances[0];
        answer[thread.blockIdx.x].pathNo = bestPermutations[0];
    }
}
/// <summary>
/// Copies <c>wyjscie[blockIdx.x]</c> into row <c>warstwa[0]</c> of
/// <paramref name="macierzSum"/> at column <c>blockIdx.x</c>.
/// </summary>
/// <param name="watek">Cudafy thread context.</param>
/// <param name="macierzSum">Destination matrix, indexed [row, column].</param>
/// <param name="wyjscie">Source vector.</param>
/// <param name="warstwa">Element 0 selects the destination row.</param>
public static void PrzeniesWynikDoMacierzySumGPU(GThread watek, float[,] macierzSum, float[] wyjscie, int[] warstwa)
{
    int column = watek.blockIdx.x;
    int row = warstwa[0];

    macierzSum[row, column] = wyjscie[column];
    watek.SyncThreads();
}
/// <summary>
/// Computes <c>macierzDelt[w, x] = macierzSum[w, x] * out * (1 - out)</c>, where
/// <c>w = warstwa[0]</c>, <c>x = blockIdx.x</c> and <c>out = macierzWyjsc[w, x]</c>.
/// </summary>
/// <param name="watek">Cudafy thread context.</param>
/// <param name="warstwa">Element 0 selects the row.</param>
/// <param name="macierzDelt">Destination matrix.</param>
/// <param name="macierzWyjsc">Matrix supplying the <c>out</c> term.</param>
/// <param name="macierzSum">Matrix supplying the leading factor.</param>
public static void LiczDeltyWarstwyGPU(GThread watek, int[] warstwa, float[,] macierzDelt, float[,] macierzWyjsc, float[,] macierzSum)
{
    int column = watek.blockIdx.x;
    int row = warstwa[0];
    float output = macierzWyjsc[row, column];

    macierzDelt[row, column] = macierzSum[row, column] * output * (1 - output);
    watek.SyncThreads();
}
/// <summary>
/// Computes <c>macierzDelt[w, x] = -1 * (out - odpowiedz[x]) * out * (1 - out)</c>,
/// where <c>w = warstwa[0]</c>, <c>x = blockIdx.x</c> and
/// <c>out = macierzWyjsc[w, x]</c>.
/// </summary>
/// <param name="watek">Cudafy thread context.</param>
/// <param name="macierzDelt">Destination matrix.</param>
/// <param name="macierzWyjsc">Matrix supplying the <c>out</c> term.</param>
/// <param name="warstwa">Element 0 selects the row.</param>
/// <param name="odpowiedz">Vector subtracted from <c>out</c> (presumably the expected answer; confirm with caller).</param>
public static void LiczDelteOstatniejWarstwyGPU(GThread watek, float[,] macierzDelt, float[,] macierzWyjsc, int[] warstwa, float[] odpowiedz)
{
    int column = watek.blockIdx.x;
    int row = warstwa[0];
    float output = macierzWyjsc[row, column];

    macierzDelt[row, column] = -1f * (output - odpowiedz[column]) * output * (1f - output);
    watek.SyncThreads();
}
/// <summary>
/// Integer partial dot product of <paramref name="a"/> and <paramref name="b"/>
/// over the first <c>N</c> elements; each block writes its reduced sum to
/// <c>c[blockIdx.x]</c>.
/// </summary>
/// <param name="thread">Cudafy thread context.</param>
/// <param name="a">First operand.</param>
/// <param name="b">Second operand.</param>
/// <param name="c">Receives one partial sum per block.</param>
public static void Product(GThread thread, int[] a, int[] b, int[] c)
{
    // The shared buffer has 4 slots and is indexed by threadIdx.x, so the
    // kernel only fits blocks of at most 4 threads.
    int[] cache = thread.AllocateShared<int>("cache", 4);
    int lane = thread.threadIdx.x;
    int gridSpan = thread.blockDim.x * thread.gridDim.x;

    int partial = 0;
    for (int idx = lane + thread.blockIdx.x * thread.blockDim.x; idx < N; idx += gridSpan)
    {
        partial = partial + a[idx] * b[idx];
    }
    cache[lane] = partial;
    thread.SyncThreads();

    // Pairwise reduction, halving the active range each pass.
    for (int half = thread.blockDim.x / 2; half != 0; half /= 2)
    {
        if (lane < half)
        {
            cache[lane] += cache[lane + half];
        }
        thread.SyncThreads();
    }

    if (lane == 0)
    {
        c[thread.blockIdx.x] = cache[0];
    }
}
/// <summary>
/// Copies <c>wektorWejsciowy[blockIdx.x]</c> into row 0 of
/// <paramref name="macierzWejsc"/> at column <c>blockIdx.x</c>.
/// </summary>
/// <param name="watek">Cudafy thread context.</param>
/// <param name="macierzWejsc">Destination matrix; only row 0 is written.</param>
/// <param name="wektorWejsciowy">Source vector.</param>
public static void UzupelnijWejscie(GThread watek, float[,] macierzWejsc, float[] wektorWejsciowy)
{
    int column = watek.blockIdx.x;
    float value = wektorWejsciowy[column];

    macierzWejsc[0, column] = value;
    watek.SyncThreads();
}
/// <summary>
/// Sets <c>wektor[blockIdx.x]</c> to zero.
/// </summary>
/// <param name="watek">Cudafy thread context.</param>
/// <param name="wektor">Vector to clear, one element per block.</param>
public static void ZerujWektorFloat(GThread watek, float[] wektor)
{
    int index = watek.blockIdx.x;

    wektor[index] = 0f;
    watek.SyncThreads();
}
/// <summary>
/// Atomically adds <c>neuron[w, x, y] * macierzWejsc[w, y]</c> to
/// <c>macierzWyjsc[w, x]</c> and then copies <c>macierzWyjsc[w, x]</c> into
/// <c>macierzWejsc[w + 1, x]</c>, where <c>w = warstwa[0]</c>,
/// <c>x = blockIdx.x</c> and <c>y = blockIdx.y</c>.
/// </summary>
/// <param name="watek">Cudafy thread context.</param>
/// <param name="neuron">Factors indexed [row, x, y].</param>
/// <param name="macierzWejsc">Read at row w, written at row w + 1.</param>
/// <param name="macierzWyjsc">Accumulator matrix.</param>
/// <param name="warstwa">Element 0 selects the row w.</param>
public static void UzupelnijWarstwe(GThread watek, float[, ,] neuron, float[,] macierzWejsc, float[,] macierzWyjsc, int[] warstwa)
{
    int target = watek.blockIdx.x;
    int source = watek.blockIdx.y;
    int row = warstwa[0];

    float contribution = neuron[row, target, source] * macierzWejsc[row, source];
    watek.atomicAdd(ref macierzWyjsc[row, target], contribution);

    // NOTE(review): this read happens right after this thread's own atomic add;
    // nothing in this kernel shows that the adds from other blockIdx.y values
    // have completed, so the copied value may be a partial sum. Confirm with
    // the launch configuration.
    macierzWejsc[row + 1, target] = macierzWyjsc[row, target];
    watek.SyncThreads();
}
/// <summary>
/// Sets <c>macierz[blockIdx.x, blockIdx.y]</c> to zero.
/// </summary>
/// <param name="watek">Cudafy thread context.</param>
/// <param name="macierz">Matrix to clear, one element per block.</param>
public static void ZerujMacierzFloat(GThread watek, float[,] macierz)
{
    int row = watek.blockIdx.x;
    int column = watek.blockIdx.y;

    macierz[row, column] = 0f;
    watek.SyncThreads();
}
/// <summary>
/// Applies the logistic function <c>1 / (1 + exp(-sumy[x]))</c> for
/// <c>x = blockIdx.x</c>, stores it in <c>macierzWyjsc[w, x]</c> and copies the
/// same value into <c>macierzWejsc[w + 1, x]</c>, where <c>w = warstwa[0]</c>.
/// </summary>
/// <param name="watek">Cudafy thread context.</param>
/// <param name="warstwa">Element 0 selects the row w.</param>
/// <param name="macierzWejsc">Written at row w + 1.</param>
/// <param name="macierzWyjsc">Written at row w.</param>
/// <param name="sumy">Pre-activation values, one per block.</param>
public static void PoliczFunkcjeAktywacji(GThread watek, int[] warstwa, float[,] macierzWejsc, float[,] macierzWyjsc, float[] sumy)
{
    int column = watek.blockIdx.x;
    int row = warstwa[0];

    float activation = 1f / (1f + GMath.Exp(-sumy[column]));
    macierzWyjsc[row, column] = activation;
    macierzWejsc[row + 1, column] = activation;
    watek.SyncThreads();
}
/// <summary>
/// Atomically adds <c>neuron[w, x, y] * macierzWejsc[w, y]</c> to
/// <c>wyjscie[x]</c>, where <c>w = warstwa[0]</c>, <c>x = blockIdx.x</c> and
/// <c>y = blockIdx.y</c>.
/// </summary>
/// <param name="watek">Cudafy thread context.</param>
/// <param name="neuron">Factors indexed [row, x, y].</param>
/// <param name="wyjscie">Accumulator vector.</param>
/// <param name="warstwa">Element 0 selects the row w.</param>
/// <param name="macierzWejsc">Input matrix, read at row w.</param>
/// <param name="macierzSum">Unused by this kernel.</param>
public static void LiczWarstweGPU(GThread watek, float[, ,] neuron, float[] wyjscie, int[] warstwa, float[,] macierzWejsc, float[,] macierzSum)
{
    int target = watek.blockIdx.x;
    int source = watek.blockIdx.y;
    int row = warstwa[0];

    // Atomic because blocks that differ only in blockIdx.y add to the same element.
    float contribution = neuron[row, target, source] * macierzWejsc[row, source];
    watek.atomicAdd(ref wyjscie[target], contribution);
    watek.SyncThreads();
}
/// <summary>
/// Atomically adds <c>macierzDelt[w + 1, y] * neuron[w + 1, y, x]</c> to
/// <c>macierzSum[w, x]</c>, where <c>w = warstwa[0]</c>, <c>x = blockIdx.x</c>
/// and <c>y = blockIdx.y</c>.
/// </summary>
/// <param name="watek">Cudafy thread context.</param>
/// <param name="warstwa">Element 0 selects the row w.</param>
/// <param name="macierzDelt">Read at row w + 1.</param>
/// <param name="macierzSum">Accumulator matrix, written at row w.</param>
/// <param name="neuron">Factors, read at [w + 1, y, x].</param>
public static void LiczSumyWarstwyGPU(GThread watek, int[] warstwa, float[,] macierzDelt, float[,] macierzSum, float[, ,] neuron)
{
    int target = watek.blockIdx.x;
    int source = watek.blockIdx.y;
    int row = warstwa[0];
    int nextRow = row + 1;

    // Atomic because blocks that differ only in blockIdx.y add to the same element.
    float contribution = macierzDelt[nextRow, source] * neuron[nextRow, source, target];
    watek.atomicAdd(ref macierzSum[row, target], contribution);
    watek.SyncThreads();
}
/// <summary>
/// Computes per-block partial sums of the element-wise product of
/// <paramref name="a"/> and <paramref name="b"/> over the first <c>N</c>
/// elements and stores each block's sum in <c>c[blockIdx.x]</c>.
/// </summary>
/// <param name="thread">Cudafy thread context.</param>
/// <param name="a">First operand.</param>
/// <param name="b">Second operand.</param>
/// <param name="c">Receives one partial sum per block.</param>
public static void dot(GThread thread, float[] a, float[] b, float[] c)
{
    float[] cache = thread.AllocateShared<float>("cache", threadsPerBlock);
    int slot = thread.threadIdx.x;

    // Accumulate this thread's share of the products.
    float running = 0;
    int element = slot + thread.blockIdx.x * thread.blockDim.x;
    while (element < N)
    {
        running += a[element] * b[element];
        element += thread.blockDim.x * thread.gridDim.x;
    }

    // Publish the thread's sum and wait for the rest of the block.
    cache[slot] = running;
    thread.SyncThreads();

    // Tree reduction; relies on the block width being a power of two.
    int active = thread.blockDim.x / 2;
    while (active != 0)
    {
        if (slot < active)
        {
            cache[slot] += cache[slot + active];
        }
        thread.SyncThreads();
        active /= 2;
    }

    // Thread 0 of each block writes the block total.
    if (slot == 0)
    {
        c[thread.blockIdx.x] = cache[0];
    }
}
/// <summary>
/// Writes <c>c[i] = (a[i] + b[i]) * ConstantMemory[i]</c> for the thread's
/// global index <c>i</c>.
/// </summary>
/// <param name="thread">Cudafy thread context.</param>
/// <param name="a">First addend (explicitly marked as global address space).</param>
/// <param name="b">Second addend.</param>
/// <param name="c">Destination.</param>
public static void VectorAdd(GThread thread, [CudafyAddressSpace(eCudafyAddressSpace.Global)] int[] a, int[] b, int[] c)
{
    // Allocated but never read or written by this kernel.
    int[] shared = thread.AllocateShared<int>("shared", Program.N);

    int globalIndex = thread.blockIdx.x * thread.blockDim.x + thread.threadIdx.x;
    int sum = a[globalIndex] + b[globalIndex];

    c[globalIndex] = sum * ConstantMemory[globalIndex];
    thread.SyncThreads();
}
/// <summary>
/// Runs a table-driven state machine over <paramref name="input"/> from each
/// start position assigned to this thread. Whenever a reached state has a
/// positive entry in <paramref name="output_table"/>, that entry is stored in
/// <c>matched_result[start]</c>.
/// </summary>
/// <remarks>
/// Changes from the original: the three <c>SyncThreads</c> calls were removed
/// (they sat inside data-dependent loops and the kernel uses no shared memory),
/// the transition-table column is range-checked, and the unused
/// <c>cacheIndex</c> local is gone. A symbol outside the table's column range is
/// treated as "no transition".
/// </remarks>
/// <param name="thread">Cudafy thread context.</param>
/// <param name="table1">Transition table [state, symbol]; -1 means no transition.</param>
/// <param name="output_table">Per-state match value; positive values are recorded.</param>
/// <param name="matched_result">Receives the match value at the start position.</param>
/// <param name="input">Characters to scan; the symbol is the character minus 'A'.</param>
/// <param name="size1">Element 0 is the number of input characters to scan.</param>
public static void Dot(GThread thread, int[,] table1, int[] output_table, char[] matched_result, char[] input, int[] size1)
{
    int length = size1[0];
    int symbolCount = table1.GetLength(1);
    int stride = N * thread.gridDim.x;

    int start = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x;
    while (start < length)
    {
        int state = 0;
        int pos = start;

        // Follow transitions until the machine dies or the input ends.
        while ((state != -1) && (pos < length))
        {
            int symbol = (int)(input[pos]) - (int)('A');
            pos = pos + 1;

            int nextState = -1;
            if (symbol >= 0 && symbol < symbolCount)
            {
                nextState = table1[state, symbol];
            }

            if (nextState != -1)
            {
                int matchVec = output_table[nextState];
                if (matchVec > 0)
                {
                    matched_result[start] = (char)matchVec;
                }
            }

            state = nextState;
        }

        start = start + stride;
    }
}
/// <summary>
/// For <c>x = blockIdx.x</c> and <c>y = blockIdx.y</c>, atomically adds
/// <c>stala[0] * macierzDelt[x, y] * macierzWejsc[x, z]</c> to
/// <c>neuron[x, y, z]</c> for every <c>z</c> along the third dimension of
/// <paramref name="neuron"/>.
/// </summary>
/// <param name="watek">Cudafy thread context.</param>
/// <param name="neuron">Array updated in place.</param>
/// <param name="macierzDelt">Factor indexed [x, y].</param>
/// <param name="macierzWejsc">Factor indexed [x, z].</param>
/// <param name="stala">Element 0 is a scalar multiplier (presumably the learning rate; confirm with caller).</param>
public static void UaktualnijWagiNeuronowGPU(GThread watek, float[, ,] neuron, float[,] macierzDelt, float[,] macierzWejsc, float[] stala)
{
    int layer = watek.blockIdx.x;
    int unit = watek.blockIdx.y;
    int inputCount = neuron.GetLength(2);

    for (int input = 0; input < inputCount; input++)
    {
        float step = stala[0] * macierzDelt[layer, unit] * macierzWejsc[layer, input];
        watek.atomicAdd(ref neuron[layer, unit, input], step);
    }

    watek.SyncThreads();
}
/// <summary>
/// Scans <paramref name="buffer"/> with a table-driven state machine started at
/// every byte position assigned to this thread. States below
/// <paramref name="initialState"/> are treated as matches and recorded in
/// <paramref name="foundID"/> / <paramref name="foundSOF"/>.
/// </summary>
/// <remarks>
/// Changes from the original: the two bytes at <c>fileEnd</c> and
/// <c>fileEnd + 1</c> are read only when both lie inside the buffer (otherwise
/// the match is not recorded), and results are written only while the claimed
/// slot fits in both output arrays. <c>foundCount[0]</c> is still incremented
/// for every match, so the host can detect overflow.
/// </remarks>
/// <param name="thread">Cudafy thread context.</param>
/// <param name="buffer">Bytes to scan.</param>
/// <param name="initialState">Start state; states below it are match states.</param>
/// <param name="lookup">Transition table [state, byte]; state 0 stops the scan from the current start.</param>
/// <param name="targetEndLength">Per-target offset added to the start position for the even-state check.</param>
/// <param name="resultCount">Per-target counter, incremented for odd match states.</param>
/// <param name="foundCount">Element 0 is the shared result counter.</param>
/// <param name="foundID">Receives the match state of each result.</param>
/// <param name="foundSOF">Receives the start position of each result.</param>
public static void PFACAnalyse(GThread thread, byte[] buffer, int initialState, int[,] lookup, int[] targetEndLength, uint[] resultCount, int[] foundCount, byte[] foundID, int[] foundSOF)
{
    int n = buffer.Length;
    int i = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x;
    int stride = thread.blockDim.x * thread.gridDim.x;

    for (; i < n; i += stride)
    {
        int state = initialState;
        int pos = i;

        while (pos < n)
        {
            state = lookup[state, buffer[pos]];
            if (state == 0)
            {
                break;
            }

            if (state < initialState)
            {
                int target = (int)((state + 1) / 2) - 1;
                bool record = false;

                if ((state - 1) % 2 == 0)
                {
                    // Odd match state: count it and always record it.
                    thread.atomicAdd(ref resultCount[target], 1);
                    record = true;
                }
                else
                {
                    // Even match state: record only if the bytes at the computed
                    // end offset pass the check, and only if they are readable.
                    int fileEnd = i + targetEndLength[target];
                    if (fileEnd >= 0 && fileEnd + 1 < n)
                    {
                        if (buffer[fileEnd] != 0x38 && buffer[fileEnd + 1] != 0x38 && buffer[fileEnd + 1] != 0x3B)
                        {
                            record = true;
                        }
                    }
                }

                if (record)
                {
                    // atomicAdd returns the previous counter value, used as this result's slot.
                    int counter = thread.atomicAdd(ref foundCount[0], 1);
                    if (counter >= 0 && counter < foundID.Length && counter < foundSOF.Length)
                    {
                        foundID[counter] = (byte)state;
                        foundSOF[counter] = i;
                    }
                }
            }

            pos++;
        }
    }

    thread.SyncThreads();
}
/// <summary>
/// For the pixel at this thread's global index <c>o</c>, writes <c>o</c> to
/// <c>output[o]</c> if the pixel's red, green and blue components all equal
/// those of any entry in <paramref name="colors"/>; otherwise writes 0.
/// </summary>
/// <remarks>
/// Changes from the original: the <c>SyncThreads</c> call inside the
/// match branch was removed (only matching threads reached it and the kernel
/// uses no shared memory), and threads whose index falls outside
/// <paramref name="rgbColors"/> or <paramref name="output"/> now return without
/// touching memory. Commented-out experiments were dropped.
/// </remarks>
/// <param name="thread">Cudafy thread context.</param>
/// <param name="rgbColors">Pixels to test, one per thread.</param>
/// <param name="colors">Colors to search for.</param>
/// <param name="indices">Unused by this kernel.</param>
/// <param name="output">Per-pixel result: the pixel index on a match, else 0.</param>
public static void FindPixel(GThread thread, GPUColorBGRA[] rgbColors, GPUColorBGRA[] colors, int[] indices, float[] output)
{
    int o = thread.threadIdx.x + thread.blockDim.x * thread.blockIdx.x;

    // Padding threads beyond the data do nothing.
    if (o >= rgbColors.Length || o >= output.Length)
    {
        return;
    }

    output[o] = 0;

    for (int i = 0; i < colors.Length; i++)
    {
        if (rgbColors[o].Red == colors[i].Red
            && rgbColors[o].Green == colors[i].Green
            && rgbColors[o].Blue == colors[i].Blue)
        {
            output[o] = o;
            break;
        }
    }
}
/// <summary>
/// Stores the launch dimensions in the answer array: for each block,
/// <c>answer[blockIdx.x].distance</c> receives <c>gridDim.x</c> and
/// <c>answer[blockIdx.x].pathNo</c> receives <c>blockDim.x</c>, staged through
/// a shared per-thread buffer.
/// </summary>
/// <param name="thread">Cudafy thread context.</param>
/// <param name="answer">Receives one entry per block.</param>
public static void GpuFindPathDistance(GThread thread, AnswerStruct[] answer)
{
    var staged = thread.AllocateShared<AnswerStruct>("ansL", ThreadsPerBlock);
    var lane = thread.threadIdx.x;

    // Arithmetic loop whose result is never used.
    var sum = 0;
    var step = 0;
    while (step < thread.blockDim.x)
    {
        sum += step * lane;
        step++;
    }

    staged[lane].distance = thread.gridDim.x;
    staged[lane].pathNo = thread.blockDim.x;
    thread.SyncThreads();

    // Thread 0 publishes its staged entry for the whole block.
    if (lane == 0)
    {
        answer[thread.blockIdx.x] = staged[0];
    }
}
/// <summary>
/// Computes partial sums of the convolution kernels' gradient. Block
/// <c>blockIdx.x</c> selects the result element, each thread contributes at most
/// one <c>input * gradient</c> product, the block reduces those products in
/// shared memory, and thread 0 stores the block total in
/// <c>resultPartials[blockIdx.x, blockIdx.y]</c>.
/// </summary>
/// <param name="thread">Cudafy thread context.</param>
/// <param name="input">Flattened input values, indexed through <c>shapes[0]</c>.</param>
/// <param name="gradient">Flattened gradient values, indexed through <c>shapes[2]</c>.</param>
/// <param name="resultPartials">Receives one partial sum per (blockIdx.x, blockIdx.y).</param>
/// <param name="shapes">Shape descriptors; this kernel uses entries 0, 1, 2 and 4.</param>
/// <param name="paddingX">Horizontal padding.</param>
/// <param name="paddingY">Vertical padding.</param>
/// <param name="stride">Convolution stride.</param>
private static void GpuConv2DKernelsGradient(GThread thread, float[] input, float[] gradient, float[,] resultPartials, GpuShape[] shapes, int paddingX, int paddingY, int stride)
{
    /*
     * Reference CPU loop (from the original source):
     *
     * for (int kernelD = 0; kernelD < kernels.Depth; ++kernelD)
     * for (int kernelH = 0; kernelH < kernels.Height; ++kernelH)
     * for (int kernelW = 0; kernelW < kernels.Width; ++kernelW)
     * for (int kernelN = 0; kernelN < kernels.BatchSize; ++kernelN)
     * {
     *     for (int n = 0; n < gradient.BatchSize; ++n)
     *     for (int h = -paddingY, outH = 0; outH < gradient.Height; h += stride, ++outH)
     *     for (int w = -paddingX, outW = 0; outW < gradient.Width; w += stride, ++outW)
     *     {
     *         float grad = gradient[outW, outH, kernelN, n];
     *         float kernGradVal = input.TryGet(0, w + kernelW, h + kernelH, kernelD, n) * grad;
     *         kernelsGradient[kernelW, kernelH, kernelD, kernelN] += kernGradVal;
     *     }
     * }
     */

    // Partial products, reduced below.
    float[] sdata = thread.AllocateShared<float>("sdata", THREADS_PER_BLOCK);

    int lane = thread.threadIdx.x;
    int resultElemId = thread.blockIdx.x;
    int workItem = (thread.blockDim.x * thread.blockIdx.y) + lane;
    int workItemCount = shapes[4].BatchSize * shapes[4].Height * shapes[4].Width;

    // Kernel coordinates of the result element handled by this block.
    int kernelN = shapes[1].GetBatch(resultElemId);
    int kernelD = shapes[1].GetDepth(resultElemId);
    int kernelH = shapes[1].GetHeight(resultElemId);
    int kernelW = shapes[1].GetWidth(resultElemId);

    // Gradient coordinates of the work item handled by this thread.
    int n = shapes[4].GetBatch(workItem);
    int outH = shapes[4].GetHeight(workItem);
    int outW = shapes[4].GetWidth(workItem);

    int h = -paddingY + stride * outH;
    int w = -paddingX + stride * outW;

    float partial = 0;
    if (workItem < workItemCount)
    {
        // A negative index means "no input here"; the contribution stays 0.
        int inputIndex = shapes[0].TryGetIndex(w + kernelW, h + kernelH, kernelD, n);
        if (inputIndex >= 0)
        {
            partial = input[inputIndex] * gradient[shapes[2].GetIndex(outW, outH, kernelN, n)];
        }
    }

    sdata[lane] = partial;
    thread.SyncThreads();

    // Pairwise reduction, halving the active range each pass.
    for (int half = thread.blockDim.x / 2; half != 0; half /= 2)
    {
        if (lane < half)
        {
            sdata[lane] += sdata[lane + half];
        }
        thread.SyncThreads();
    }

    if (lane == 0)
    {
        resultPartials[thread.blockIdx.x, thread.blockIdx.y] = sdata[0];
    }
}
/// <summary>
/// Writes the launch dimensions into <paramref name="answer"/>: each block's
/// entry gets <c>gridDim.x</c> as <c>distance</c> and <c>blockDim.x</c> as
/// <c>pathNo</c>. Values are first stored per thread in shared memory and
/// thread 0's entry is copied out.
/// </summary>
/// <param name="thread">Cudafy thread context.</param>
/// <param name="answer">Receives one entry per block.</param>
public static void GpuFindPathDistance(GThread thread, AnswerStruct[] answer)
{
    var perThread = thread.AllocateShared<AnswerStruct>("ansL", ThreadsPerBlock);

    var gridWidth = thread.gridDim.x;
    var blockWidth = thread.blockDim.x;

    // Accumulates i * threadIdx.x for i in [0, blockDim.x); the total is not used.
    var sum = 0;
    for (int i = 0; i < thread.blockDim.x; i++)
    {
        sum += i * thread.threadIdx.x;
    }

    perThread[thread.threadIdx.x].distance = gridWidth;
    perThread[thread.threadIdx.x].pathNo = blockWidth;

    thread.SyncThreads();

    if (thread.threadIdx.x != 0)
    {
        return;
    }

    answer[thread.blockIdx.x] = perThread[0];
}
/// <summary>
/// Computes a per-pixel error between the pattern strip and the source image
/// with <c>ComputePixelError</c>, stores it in shared memory, and has thread
/// (0, 0) of each block sum the block's values into
/// <paramref name="blockAvgs"/>. Values belonging to the pattern after the one
/// at the block's first column are summed separately and stored as block 0 of
/// that next pattern.
/// </summary>
/// <remarks>
/// Change from the original: <c>patternLocations</c> is read only by threads
/// that pass the <c>x</c>/<c>y</c> bounds check. Previously every thread read
/// it, so threads beyond <paramref name="patternsWidth"/> could index past the
/// last pattern.
/// </remarks>
/// <param name="thread">Cudafy thread context (2-D blocks).</param>
/// <param name="source">Source image bytes, 3 bytes per pixel.</param>
/// <param name="sourceStride">Bytes per source row.</param>
/// <param name="sourceWidth">Unused by this kernel.</param>
/// <param name="sourceHeight">Unused by this kernel.</param>
/// <param name="patterns">Pattern strip bytes, 3 bytes per pixel.</param>
/// <param name="patternsStride">Bytes per pattern-strip row.</param>
/// <param name="patternsWidth">Width of the pattern strip in pixels.</param>
/// <param name="patternsHeight">Height of the pattern strip in pixels.</param>
/// <param name="patternsOffsetX">Horizontal pixel offset into the pattern strip.</param>
/// <param name="patternStride">Unused by this kernel.</param>
/// <param name="patternWidth">Width of a single pattern in pixels.</param>
/// <param name="patternHeight">Unused by this kernel.</param>
/// <param name="patternLocations">Per-pattern source position: [id, 0] = x, [id, 1] = y.</param>
/// <param name="blockAvgs">Output sums indexed [pattern, block within pattern, blockIdx.y].</param>
public static void RepeatedAverageColorDifferenceKernel(GThread thread, byte[] source, int sourceStride, int sourceWidth, int sourceHeight, byte[] patterns, int patternsStride, int patternsWidth, int patternsHeight, int patternsOffsetX, int patternStride, int patternWidth, int patternHeight, int[,] patternLocations, int[,,] blockAvgs)
{
    var threadX = thread.threadIdx.x;
    var threadY = thread.threadIdx.y;
    int x = thread.blockIdx.x * thread.blockDim.x + threadX;
    int y = thread.blockIdx.y * thread.blockDim.y + threadY;

    // Per-pixel errors of this block, kept in shared memory.
    var pixelAvgDistance = thread.AllocateShared<float>("pixelSums", (int)Filters.BlockSideLength, (int)Filters.BlockSideLength);
    pixelAvgDistance[threadX, threadY] = 0;

    var patternId = x / patternWidth;

    if (x < patternsWidth && y < patternsHeight)
    {
        var sourceStartX = patternLocations[patternId, 0];
        var sourceStartY = patternLocations[patternId, 1];

        var patternIndex = 3 * (x + patternsOffsetX) + y * patternsStride;
        var sourceIndex = 3 * ((x % patternWidth) + sourceStartX) + (y + sourceStartY) * sourceStride;
        pixelAvgDistance[threadX, threadY] = ComputePixelError(patterns, patternIndex, source, sourceIndex);
    }

    // Wait until all threads of the block have stored their value.
    thread.SyncThreads();

    // The first thread of each block adds up the block; the CPU adds up the grid.
    if (threadX == 0 && threadY == 0)
    {
        float thisBlockAvg = 0;
        float nextBlockAvg = 0;

        for (var ty = 0; ty < (int)Filters.BlockSideLength; ty++)
        {
            for (var tx = 0; tx < (int)Filters.BlockSideLength; tx++)
            {
                // Columns that fall into the following pattern are summed separately.
                if ((x + tx) / patternWidth == x / patternWidth)
                {
                    thisBlockAvg += pixelAvgDistance[tx, ty];
                }
                else
                {
                    nextBlockAvg += pixelAvgDistance[tx, ty];
                }
            }
        }

        // NOTE(review): patternWidth / blockDim.x is an integer division, so
        // Ceiling has no effect here. Left unchanged because the host-side layout
        // of blockAvgs depends on this value; confirm before changing it.
        var blocksPerPattern = (int)GMath.Ceiling(patternWidth / thread.blockDim.x);

        if (thisBlockAvg > 0)
        {
            blockAvgs[patternId, thread.blockIdx.x % blocksPerPattern, thread.blockIdx.y] = (int)GMath.Round(thisBlockAvg);
        }

        if (nextBlockAvg > 0)
        {
            // The last block of this pattern is the first block of the next one,
            // unless this is already the last pattern.
            var isLastLastPattern = (patternsWidth / patternWidth) - 1 == patternId;
            if (!isLastLastPattern)
            {
                blockAvgs[patternId + 1, 0, thread.blockIdx.y] = (int)GMath.Round(nextBlockAvg);
            }
        }
    }
}
/// <summary>
/// Scores one 3x3 tile arrangement per thread and keeps, per block, the
/// arrangement with the lowest metric in <c>evaluations[blockIdx.x]</c>.
/// The metric is the sum of <c>LeftRightFit</c> over the six horizontally
/// adjacent pairs and <c>TopBottomFit</c> over the six vertically adjacent
/// pairs of the layout
/// <code>
/// 0 1 2
/// 3 4 5
/// 6 7 8
/// </code>
/// </summary>
/// <param name="gThread">Cudafy thread context.</param>
/// <param name="evaluations">Receives the best evaluation of each block.</param>
public static void ExplorePermutationsKernel(GThread gThread, Evaluation[] evaluations)
{
    // Both shared buffers are sized for 256 threads per block.
    var blockEvaluations = gThread.AllocateShared<Evaluation>("be", 256);
    var tiles = gThread.AllocateShared<byte>("v", 256, 9);

    var lane = gThread.threadIdx.x;
    var permutation = gThread.blockIdx.x * gThread.blockDim.x + gThread.threadIdx.x;

    // Fills row `lane` of `tiles` with the tile order for this permutation.
    TileOrderFromPermutation(Permutations, permutation, 9, tiles, lane);

    var metric = 0f;

    // Horizontal neighbours, one row of the grid at a time.
    for (var row = 0; row < 3; row++)
    {
        var left = row * 3;
        metric += LeftRightFit[tiles[lane, left], tiles[lane, left + 1]] + LeftRightFit[tiles[lane, left + 1], tiles[lane, left + 2]];
    }

    // Vertical neighbours, one column of the grid at a time.
    for (var column = 0; column < 3; column++)
    {
        metric += TopBottomFit[tiles[lane, column], tiles[lane, column + 3]] + TopBottomFit[tiles[lane, column + 3], tiles[lane, column + 6]];
    }

    blockEvaluations[lane].Permutation = permutation;
    blockEvaluations[lane].Metric = metric;
    gThread.SyncThreads();

    // Min-reduction over the 256 slots.
    for (var half = 256 / 2; half > 0; half /= 2)
    {
        if (lane < half && blockEvaluations[lane].Metric > blockEvaluations[lane + half].Metric)
        {
            blockEvaluations[lane] = blockEvaluations[lane + half];
        }
        gThread.SyncThreads();
    }

    if (lane == 0)
    {
        evaluations[gThread.blockIdx.x] = blockEvaluations[0];
    }
}
/// <summary>
/// Adds the red, green and blue components of one screenshot pixel to the
/// accumulator slot of the key cell that pixel falls into. The slot index is
/// <c>((keyColumn * keysHeight) + keyRow) * 3</c>, followed by +0 / +1 / +2 for
/// red / green / blue. Also writes <c>blockIdx.x</c> to <c>debugOutput[o]</c>.
/// </summary>
/// <remarks>
/// Change from the original: the three accumulations used a plain <c>+=</c> on
/// <paramref name="output"/>. Many pixels map to the same key cell, so
/// concurrent threads could overwrite each other's updates; they now use
/// <c>atomicAdd</c>. The barriers and commented-out experiments were removed;
/// the kernel uses no shared memory.
/// </remarks>
/// <param name="thread">Cudafy thread context.</param>
/// <param name="screenshotColors">Screenshot pixels.</param>
/// <param name="colors">Unused by this kernel.</param>
/// <param name="indices">Unused by this kernel.</param>
/// <param name="output">Per-key-cell colour sums, 3 floats per cell.</param>
/// <param name="debugOutput">Receives <c>blockIdx.x</c> at the pixel index.</param>
public static void AverageColor(GThread thread, GPUColorBGRA[] screenshotColors, GPUColorBGRA[] colors, int[] indices, float[] output, float[] debugOutput)
{
    // NOTE(review): the blockIdx.y term is scaled by blockDim.y only, exactly as
    // in the original; confirm this matches the launch configuration.
    int o = thread.threadIdx.x + thread.blockDim.x * thread.blockIdx.x + thread.blockDim.y * thread.blockIdx.y;

    // Pixel position on screen and the key cell it belongs to.
    var column = o % screenWidth;
    var row = (o / screenWidth);
    var keyColumnIndex = column / pixelPerKeyColumn;
    var keyRowIndex = row / pixelPerKeyRow;

    debugOutput[o] = thread.blockIdx.x;

    var slot = ((keyColumnIndex * keysHeight) + keyRowIndex) * 3;
    thread.atomicAdd(ref output[slot + 0], (float)screenshotColors[o].Red);
    thread.atomicAdd(ref output[slot + 1], (float)screenshotColors[o].Green);
    thread.atomicAdd(ref output[slot + 2], (float)screenshotColors[o].Blue);
}
/// <summary>
/// Casts one ray per pixel through the chunked voxel data and writes the colour
/// of the first non-black voxel hit (or black) to <paramref name="imageBytes"/>
/// as 4 bytes per pixel in blue, green, red, alpha order. The final ray
/// direction is stored in <paramref name="pixelMap"/>.
/// </summary>
/// <remarks>
/// Changes from the original:
/// the bounds check used <c>&gt;</c>, so threads with <c>tx</c> equal to the
/// width (or <c>ty</c> equal to the height) wrote past the end of
/// <paramref name="pixelMap"/> and <paramref name="imageBytes"/>; it now uses
/// <c>&gt;=</c>. The disabled <c>if (false)</c> Bresenham block, the statements
/// after the unconditional <c>return</c>, and locals used only by them were
/// removed.
/// </remarks>
/// <param name="thread">Cudafy thread context (2-D launch).</param>
/// <param name="voxelMap">Unused by this kernel.</param>
/// <param name="pixelMap">Per-pixel output; receives the ray direction in red/green/blue.</param>
/// <param name="imageBytes">Output image, 4 bytes per pixel.</param>
/// <param name="camera">Element 0 supplies position and rotation; its basis vectors are written here.</param>
/// <param name="chunkData">Chunk table indexed [x, y, z] with <c>empty</c> and <c>voxelDataIndex</c>.</param>
/// <param name="voxelData">Voxel colours, 512 consecutive entries per chunk (8 x 8 x 8).</param>
/// <param name="units">Test output: the first 64 entries are filled by row 0.</param>
public static void RayTraceMain(GThread thread, Color3[,,] voxelMap, Color3[,] pixelMap, byte[] imageBytes, Camera[] camera, ChunkData[,,] chunkData, Color3[] voxelData, FSMUnit[] units)
{
    int tx = thread.threadIdx.x + (thread.blockIdx.x * thread.blockDim.x);
    int ty = thread.threadIdx.y + (thread.blockIdx.y * thread.blockDim.y);

    if (tx >= pixelMap.GetLength(0) || ty >= pixelMap.GetLength(1))
    {
        return; // out of bounds, do no work
    }

    if (tx < 64 && ty == 0)
    {
        // test
        unsafe
        {
            units[tx].values[0] = tx + 0;
            units[tx].values[1] = tx + 1;
            units[tx].values[2] = tx + 2;
            units[tx].values[3] = tx + 3;
        }
    }

    // Camera orientation. Every thread writes the same basis vectors to camera[0].
    float hRot = camera[0].hRotation;
    float vRot = camera[0].vRotation;
    vRot = Clamp(vRot, -90.0f, 90.0f);

    float yaw = hRot;
    float pitch = vRot;
    float cosPitch = degcos(pitch);
    float sinPitch = degsin(pitch);
    float cosYaw = degcos(yaw);
    float sinYaw = degsin(yaw);

    camera[0].rightX = cosYaw;
    camera[0].rightY = 0f;
    camera[0].rightZ = -sinYaw;
    camera[0].upX = sinYaw * sinPitch;
    camera[0].upY = cosPitch;
    camera[0].upZ = cosYaw * sinPitch;
    camera[0].forwardX = sinYaw * cosPitch;
    camera[0].forwardY = -sinPitch;
    camera[0].forwardZ = cosPitch * cosYaw;

    // Raster coordinates (0..1, 0..1), sampled at the pixel centre.
    float px = ((float)(tx + 0.5f) / (float)pixelMap.GetLength(0));
    float py = ((float)(ty + 0.5f) / (float)pixelMap.GetLength(1));
    float ratio = (float)pixelMap.GetLength(0) / (float)pixelMap.GetLength(1);

    float FOV = 90.0f;
    float halfFOV = FOV / 2f;

    // Middle of the screen becomes (0, 0); range -1..+1.
    px = (px - 0.5f) * 2;
    py = (py - 0.5f) * 2;

    float vx = px * degtan(halfFOV) * ratio;
    float vy = py * degtan(halfFOV);
    float vz = -1.0f;

    float vlength = GMath.Sqrt(vx * vx + vy * vy + vz * vz);
    float norm_starting_x = vx / vlength;
    float norm_starting_y = vy / vlength;
    float norm_starting_z = vz / vlength;

    // First rotation: about the axis (0, 1, 0) by the horizontal angle.
    float x = 0f;
    float y = 1f;
    float z = 0f;
    float rho_deg = hRot;
    float c = degcos(rho_deg);
    float s = degsin(rho_deg);
    float t = (1 - degcos(rho_deg));

    float norm_final_x = norm_starting_x * (t * x * x + c) + norm_starting_y * (t * x * y - s * z) + norm_starting_z * (t * x * z + s * y);
    float norm_final_y = norm_starting_x * (t * x * y + s * z) + norm_starting_y * (t * y * y + c) + norm_starting_z * (t * y * z - s * x);
    float norm_final_z = norm_starting_x * (t * x * z - s * y) + norm_starting_y * (t * y * z + s * x) + norm_starting_z * (t * z * z + c);

    norm_starting_x = norm_final_x;
    norm_starting_y = norm_final_y;
    norm_starting_z = norm_final_z;

    // Second rotation: about the camera's new local 'right' vector by the vertical angle.
    x = camera[0].rightX;
    y = camera[0].rightY;
    z = camera[0].rightZ;
    rho_deg = vRot;
    c = degcos(rho_deg);
    s = degsin(rho_deg);
    t = (1 - degcos(rho_deg));

    norm_final_x = norm_starting_x * (t * x * x + c) + norm_starting_y * (t * x * y - s * z) + norm_starting_z * (t * x * z + s * y);
    norm_final_y = norm_starting_x * (t * x * y + s * z) + norm_starting_y * (t * y * y + c) + norm_starting_z * (t * y * z - s * x);
    norm_final_z = norm_starting_x * (t * x * z - s * y) + norm_starting_y * (t * y * z + s * x) + norm_starting_z * (t * z * z + c);

    vx = norm_final_x;
    vy = norm_final_y;
    vz = norm_final_z;

    // Ray state: starts at the camera position.
    float rayx = camera[0].x;
    float rayy = camera[0].y;
    float rayz = camera[0].z;

    float red = 0f;
    float green = 0f;
    float blue = 0f;

    float maxDistance = 64.0f * 2f;
    float currentDistance = 0.0f;

    // Ray cast: step from voxel boundary to voxel boundary until a voxel is hit
    // or the maximum distance is exceeded.
    while (currentDistance < maxDistance)
    {
        int voxelx = (int)GMath.Floor(rayx);
        int voxely = (int)GMath.Floor(rayy);
        int voxelz = (int)GMath.Floor(rayz);

        // Chunks are 8 voxels wide; +16 shifts the chunk coordinate into the array range.
        // NOTE(review): for negative voxel coordinates "/ 8" truncates toward zero
        // while "& 7" wraps downward, so the chunk and in-chunk indices may not
        // agree there. Left unchanged; confirm against how chunkData is filled.
        int chunkX = voxelx / 8 + 16;
        int chunkY = voxely / 8 + 16;
        int chunkZ = voxelz / 8 + 16;

        int chunkInternalX = voxelx & 7;
        int chunkInternalY = voxely & 7;
        int chunkInternalZ = voxelz & 7;

        bool insideChunks = chunkX >= 0 && chunkX <= 32
            && chunkY >= 0 && chunkY <= 32
            && chunkZ >= 0 && chunkZ <= 32;

        if (insideChunks)
        {
            if (chunkData[chunkX, chunkY, chunkZ].empty == 0)
            {
                int index = chunkData[chunkX, chunkY, chunkZ].voxelDataIndex;
                int adjIndex = index + (chunkInternalX + chunkInternalY * 8 + chunkInternalZ * 64);

                // Any non-zero colour component counts as a hit.
                if (voxelData[adjIndex].red > 0f || voxelData[adjIndex].green > 0f || voxelData[adjIndex].blue > 0f)
                {
                    red = voxelData[adjIndex].red;
                    green = voxelData[adjIndex].green;
                    blue = voxelData[adjIndex].blue;
                    break;
                }
            }
        }

        // Position inside the current unit cube.
        float ix = rayx - GMath.Floor(rayx);
        float iy = rayy - GMath.Floor(rayy);
        float iz = rayz - GMath.Floor(rayz);

        // Distance remaining to the cube face along each axis.
        if (vx > 0)
        {
            ix = 1f - ix;
        }
        if (vy > 0)
        {
            iy = 1f - iy;
        }
        if (vz > 0)
        {
            iz = 1f - iz;
        }

        ix = flabs(ix / vx);
        iy = flabs(iy / vy);
        iz = flabs(iz / vz);

        // Step just over the nearest boundary.
        float nextDistance = GMath.Min(iz, GMath.Min(ix, iy)) + 0.01f;

        rayx += vx * nextDistance;
        rayy += vy * nextDistance;
        rayz += vz * nextDistance;
        currentDistance += nextDistance;
    }

    // Render the pixel to the byte buffer (blue, green, red, alpha).
    int imageByteIndex = (tx + (ty * pixelMap.GetLength(0))) * 4;
    imageBytes[imageByteIndex + 0] = (byte)(blue * 255f);
    imageBytes[imageByteIndex + 1] = (byte)(green * 255f);
    imageBytes[imageByteIndex + 2] = (byte)(red * 255f);
    imageBytes[imageByteIndex + 3] = 255;

    // Store the ray direction for this pixel.
    pixelMap[tx, ty].red = vx;
    pixelMap[tx, ty].green = vy;
    pixelMap[tx, ty].blue = vz;
}
/// <summary>
/// Computes one partial sum per thread block for the input gradient of a 2D convolution.
/// Each thread produces at most one product gradient * rotated-kernel weight; the block then
/// sums its products in shared memory and thread 0 stores the block total in
/// <paramref name="resultPartials"/>[blockIdx.x, blockIdx.y].
/// </summary>
/// <param name="thread">CUDAfy thread context.</param>
/// <param name="gradient">Flattened output gradients, indexed through shapes[0].</param>
/// <param name="rotKernels">Flattened rotated kernels, indexed through shapes[1].</param>
/// <param name="resultPartials">Receives one partial sum per (blockIdx.x, blockIdx.y).</param>
/// <param name="shapes">
/// shapes[0] indexes <paramref name="gradient"/>, shapes[1] indexes <paramref name="rotKernels"/>,
/// shapes[2] decodes the result element id, shapes[3] decodes the per-element work id.
/// </param>
/// <param name="paddingX">Horizontal padding subtracted from the strided output column.</param>
/// <param name="paddingY">Vertical padding subtracted from the strided output row.</param>
/// <param name="stride">Multiplier applied to the output row/column.</param>
private static void GpuConv2DInputGradient(GThread thread, float[] gradient, float[] rotKernels, float[,] resultPartials, GpuShape[] shapes, int paddingX, int paddingY, int stride)
{
    // Reference (sequential) formulation of what the whole launch computes:
    //
    //   for n in gradients.BatchSize
    //     for (outW, w = -paddingX; outW < inputGradients.Width;  w += stride)
    //       for (outH, h = -paddingY; outH < inputGradients.Height; h += stride)
    //         for outD in inputGradients.Depth
    //           for kernelN in rotKernels.BatchSize
    //             for kernelH in rotKernels.Height
    //               for kernelW in rotKernels.Width
    //                 inputGradients[outW, outH, outD, n] +=
    //                     gradients.TryGet(0, w + kernelW, h + kernelH, kernelN, n)
    //                     * rotKernels[kernelW, kernelH, outD, kernelN];

    // Per-block scratch buffer holding one product per thread until it is reduced.
    float[] sdata = thread.AllocateShared<float>("sdata", THREADS_PER_BLOCK);

    int lane = thread.threadIdx.x;
    int elementId = thread.blockIdx.x;
    // blockIdx.y selects which slice of the per-element work this block covers.
    int workId = (thread.blockDim.x * thread.blockIdx.y) + thread.threadIdx.x;
    int workItemsPerElement = shapes[1].BatchSize * shapes[1].Height * shapes[1].Width;

    // Coordinates of the result element handled by this block.
    int resultBatch = shapes[2].GetBatch(elementId);
    int resultDepth = shapes[2].GetDepth(elementId);
    int resultRow = shapes[2].GetHeight(elementId);
    int resultCol = shapes[2].GetWidth(elementId);

    // Kernel coordinates handled by this thread.
    int kBatch = shapes[3].GetBatch(workId);
    int kRow = shapes[3].GetHeight(workId);
    int kCol = shapes[3].GetWidth(workId);

    int baseRow = -paddingY + stride * resultRow;
    int baseCol = -paddingX + stride * resultCol;

    float product = 0;
    if (workId < workItemsPerElement)
    {
        // A negative index means there is nothing to read for this position
        // (presumably it falls into the padding - verify against TryGetIndex).
        int gradientPos = shapes[0].TryGetIndex(baseCol + kCol, baseRow + kRow, kBatch, resultBatch);
        if (gradientPos >= 0)
        {
            int kernelPos = shapes[1].GetIndex(kCol, kRow, resultDepth, kBatch);
            product = gradient[gradientPos] * rotKernels[kernelPos];
        }
    }

    sdata[lane] = product;
    thread.SyncThreads();

    // Halving reduction over the block; every thread takes part in each barrier.
    for (int half = thread.blockDim.x / 2; half != 0; half /= 2)
    {
        if (lane < half)
        {
            sdata[lane] += sdata[lane + half];
        }
        thread.SyncThreads();
    }

    if (lane == 0)
    {
        resultPartials[thread.blockIdx.x, thread.blockIdx.y] = sdata[0];
    }
}
/// <summary>
/// In-place edge filter: for every pixel, sums the absolute per-channel differences to the
/// pixels <paramref name="filterSize"/> positions to the left, right, above and below, and
/// writes that sum divided by 12 into all three channels of the pixel.
/// </summary>
/// <param name="thread">CUDAfy thread context; one thread per pixel (2D grid of 2D blocks).</param>
/// <param name="source">Image bytes, 3 bytes per pixel, read and overwritten in place.</param>
/// <param name="stride">Number of bytes between the starts of consecutive rows.</param>
/// <param name="imageWidth">Image width in pixels.</param>
/// <param name="imageHeight">Image height in pixels.</param>
/// <param name="filterSize">Distance, in pixels, to the neighbours that are compared.</param>
public static void EdgeFilterKernel(GThread thread, byte[] source, int stride, int imageWidth, int imageHeight, int filterSize)
{
    int x = thread.blockIdx.x * thread.blockDim.x + thread.threadIdx.x;
    int y = thread.blockIdx.y * thread.blockDim.y + thread.threadIdx.y;

    // Threads that fall outside the image are masked instead of returning early:
    // every thread of the block has to reach the SyncThreads() barrier below,
    // otherwise the barrier is executed by only part of the block.
    bool insideImage = x < imageWidth && y < imageHeight;

    int rowOffset = y * stride;
    int index = 3 * x + rowOffset;

    // Accumulated absolute difference to the (up to) four neighbours.
    var colorDifference = 0.0;

    if (insideImage)
    {
        var xLeft = x - filterSize;
        var xRight = x + filterSize;
        var yUp = y - filterSize;
        var yDown = y + filterSize;

        // Channel order presumably B, G, R (going by the original naming).
        var middleB = source[index];
        var middleG = source[index + 1];
        var middleR = source[index + 2];

        if (xLeft >= 0)
        {
            int left = xLeft * 3 + rowOffset;
            colorDifference += GMath.Abs(middleB - source[left]) + GMath.Abs(middleG - source[left + 1]) + GMath.Abs(middleR - source[left + 2]);
        }
        if (xRight < imageWidth)
        {
            int right = xRight * 3 + rowOffset;
            colorDifference += GMath.Abs(middleB - source[right]) + GMath.Abs(middleG - source[right + 1]) + GMath.Abs(middleR - source[right + 2]);
        }
        if (yUp >= 0)
        {
            int up = x * 3 + yUp * stride;
            colorDifference += GMath.Abs(middleB - source[up]) + GMath.Abs(middleG - source[up + 1]) + GMath.Abs(middleR - source[up + 2]);
        }
        if (yDown < imageHeight)
        {
            int down = x * 3 + yDown * stride;
            colorDifference += GMath.Abs(middleB - source[down]) + GMath.Abs(middleG - source[down + 1]) + GMath.Abs(middleR - source[down + 2]);
        }
    }

    // The barrier only separates reads from writes within a block; neighbours that live
    // in another block may already have been overwritten, depending on scheduling.
    // That is accepted for this task, as the impact is small.
    thread.SyncThreads();

    if (insideImage)
    {
        // 12 = 4 neighbours x 3 channels; border pixels with fewer neighbours are
        // still divided by 12.
        source[index] = source[index + 1] = source[index + 2] = (byte)(colorDifference / 12);
    }
}
/// <summary>
/// Computes the sum of squared differences between one edge of tile A and one edge of tile B.
/// Each block handles one (tile A, tile B) pair given by blockIdx.x / blockIdx.y; each of its
/// threads handles one position along the edge, and the block total is written to
/// <paramref name="fit"/>[tileA, tileB].
/// </summary>
/// <param name="gThread">CUDAfy thread context; the reduction below covers 64 threads per block.</param>
/// <param name="edgeIndexA">Which edge of tile A to read from Edges.</param>
/// <param name="edgeIndexB">Which edge of tile B to read from Edges.</param>
/// <param name="fit">Receives the summed squared difference per tile pair.</param>
public static void ComputeFitsKernel(GThread gThread, int edgeIndexA, int edgeIndexB, float[,] fit)
{
    // One squared difference per thread, reduced below.
    float[] sum = gThread.AllocateShared<float>("sum", 64);

    int tileA = gThread.blockIdx.x;
    int tileB = gThread.blockIdx.y;
    int lane = gThread.threadIdx.x;

    var delta = Edges[tileA, edgeIndexA, lane] - Edges[tileB, edgeIndexB, lane];
    sum[lane] = delta * delta;
    gThread.SyncThreads();

    // Halving reduction: 32, 16, ..., 1 active threads.
    int half = 64 / 2;
    while (half > 0)
    {
        if (lane < half)
        {
            sum[lane] += sum[lane + half];
        }
        gThread.SyncThreads();
        half /= 2;
    }

    if (lane == 0)
    {
        fit[tileA, tileB] = sum[0];
    }
}
/// <summary>
/// Compares a pattern image against a window of the source image. Every thread computes the
/// summed per-channel distance for one pattern pixel; the first thread of each block then adds
/// up the block's values and stores the total in <paramref name="blockSums"/>. The caller has
/// to add up the per-block totals itself.
/// </summary>
/// <param name="thread">CUDAfy thread context; one thread per pattern pixel.</param>
/// <param name="source">Source image bytes, 3 bytes per pixel.</param>
/// <param name="sourceStride">Bytes between the starts of consecutive source rows.</param>
/// <param name="sourceWidth">Not used by this kernel.</param>
/// <param name="sourceHeight">Not used by this kernel.</param>
/// <param name="pattern">Pattern image bytes, 3 bytes per pixel.</param>
/// <param name="patternStride">Bytes between the starts of consecutive pattern rows.</param>
/// <param name="patternWidth">Pattern width in pixels.</param>
/// <param name="patternHeight">Pattern height in pixels.</param>
/// <param name="sourceStartX">X offset of the compared window inside the source.</param>
/// <param name="sourceStartY">Y offset of the compared window inside the source.</param>
/// <param name="blockSums">Receives one sum per (blockIdx.x, blockIdx.y).</param>
public static void AverageColorDifferenceKernel(GThread thread, byte[] source, int sourceStride, int sourceWidth, int sourceHeight, byte[] pattern, int patternStride, int patternWidth, int patternHeight, int sourceStartX, int sourceStartY, int[,] blockSums)
{
    // Per-block table with one entry per thread.
    int[,] pixelSums = thread.AllocateShared<int>("pixelSums", (int)Filters.BlockSideLength, (int)Filters.BlockSideLength);

    int localX = thread.threadIdx.x;
    int localY = thread.threadIdx.y;

    // Default for threads outside the pattern and for near-black pattern pixels.
    pixelSums[localX, localY] = 0;

    int x = thread.blockIdx.x * thread.blockDim.x + localX;
    int y = thread.blockIdx.y * thread.blockDim.y + localY;

    if (x < patternWidth && y < patternHeight)
    {
        int patternPos = 3 * x + y * patternStride;

        // Pattern pixels whose first channel is 3 or less count as "close to black"
        // and are left at zero.
        if (pattern[patternPos] > 3)
        {
            int sourcePos = 3 * (x + sourceStartX) + (y + sourceStartY) * sourceStride;
            pixelSums[localX, localY] =
                Distance(pattern[patternPos], source[sourcePos]) +
                Distance(pattern[patternPos + 1], source[sourcePos + 1]) +
                Distance(pattern[patternPos + 2], source[sourcePos + 2]);
        }
    }

    // All entries of the table must be written before anyone reads them.
    thread.SyncThreads();

    // Only the first thread of the block continues; it adds up the table serially.
    if (localX != 0 || localY != 0)
    {
        return;
    }

    int side = (int)Filters.BlockSideLength;
    int total = 0;
    for (int row = 0; row < side; row++)
    {
        for (int col = 0; col < side; col++)
        {
            total += pixelSums[col, row];
        }
    }

    blockSums[thread.blockIdx.x, thread.blockIdx.y] = total;
}
/// <summary>
/// Element-wise kernel: c[i] = (a[i] + b[i]) * ConstantMemory[i], with one element per thread.
/// </summary>
/// <param name="thread">CUDAfy thread context.</param>
/// <param name="a">First input vector (explicitly placed in the global address space).</param>
/// <param name="b">Second input vector.</param>
/// <param name="c">Output vector.</param>
public static void VectorAdd(GThread thread, [CudafyAddressSpace(eCudafyAddressSpace.Global)] int[] a, int[] b, int[] c)
{
    // Shared buffer is allocated but never read or written in this kernel.
    int[] shared = thread.AllocateShared<int>("shared", Program.N);

    // Global element index of this thread. No bounds check is made, so the launch
    // configuration presumably matches the array lengths exactly - verify at the call site.
    int i = thread.blockIdx.x * thread.blockDim.x + thread.threadIdx.x;

    int pairSum = a[i] + b[i];
    c[i] = pairSum * ConstantMemory[i];

    thread.SyncThreads();
}
/// <summary>
/// Compares every in-silico mass against every peak mass for ten ion series. For each
/// (in-silico index, peak index) pair and each series, stores the absolute mass difference in
/// <paramref name="differencesArray"/> and a 1/0 flag in <paramref name="matchesFound"/>
/// telling whether that difference is below Tolerance[0].
/// </summary>
/// <remarks>
/// Output layout: ten consecutive planes of leftInsilicoMasses.Length * peakListMasses.Length
/// entries each, in the order left, right, left Ao, left A*, left Bo, left B*, right Yo,
/// right Y*, right Zo, right Zoo. Both output arrays therefore need at least ten planes.
/// All ten mass arrays are indexed with an index derived from leftInsilicoMasses.Length, so
/// they are assumed to be at least that long - confirm against the caller.
/// The parameters arraySize, numberOfMatches and modifiedIonsArray are not used by this kernel.
/// </remarks>
public static void PeaksCompare_Dalton(GThread thread, double[] leftInsilicoMasses, double[] rightInsilicoMasses, double[] insilicoMassLeftAo, double[] insilicoMassLeftAstar, double[] insilicoMassLeftBo, double[] insilicoMassLeftBstar, double[] insilicoMassRightYo, double[] insilicoMassRightYstar, double[] insilicoMassRightZo, double[] insilicoMassRightZoo, double[] peakListMasses, double[] differencesArray, int[] matchesFound, int[] arraySize, int[] numberOfMatches, double[] Tolerance, double[] modifiedIonsArray)
{
    int tidx = thread.blockIdx.x * thread.blockDim.x + thread.threadIdx.x;
    int Iindex = tidx % leftInsilicoMasses.Length;
    int Pindex = (tidx / leftInsilicoMasses.Length) % peakListMasses.Length;
    double peak = peakListMasses[Pindex];
    int planeSize = leftInsilicoMasses.Length * peakListMasses.Length;

    // Threads beyond the last (in-silico, peak) pair have nothing to do.
    //
    // Earlier revisions called SyncThreads() between the planes from inside this branch.
    // A barrier inside a branch that only part of a block takes is not valid, and no
    // barrier is needed here: every thread reads only the input arrays and writes only
    // its own slots (tidx + k * planeSize), so there is nothing to order between threads.
    if (tidx < planeSize)
    {
        double tolerance = Tolerance[0];
        int slot = tidx;
        double diff;

        // Plane 0: left in-silico masses.
        diff = leftInsilicoMasses[Iindex] - peak;
        if (diff < 0) { diff = -diff; }
        differencesArray[slot] = diff;
        if (diff < tolerance) { matchesFound[slot] = 1; } else { matchesFound[slot] = 0; }
        slot += planeSize;

        // Plane 1: right in-silico masses.
        diff = rightInsilicoMasses[Iindex] - peak;
        if (diff < 0) { diff = -diff; }
        differencesArray[slot] = diff;
        if (diff < tolerance) { matchesFound[slot] = 1; } else { matchesFound[slot] = 0; }
        slot += planeSize;

        // Plane 2: left Ao.
        diff = insilicoMassLeftAo[Iindex] - peak;
        if (diff < 0) { diff = -diff; }
        differencesArray[slot] = diff;
        if (diff < tolerance) { matchesFound[slot] = 1; } else { matchesFound[slot] = 0; }
        slot += planeSize;

        // Plane 3: left A*.
        diff = insilicoMassLeftAstar[Iindex] - peak;
        if (diff < 0) { diff = -diff; }
        differencesArray[slot] = diff;
        if (diff < tolerance) { matchesFound[slot] = 1; } else { matchesFound[slot] = 0; }
        slot += planeSize;

        // Plane 4: left Bo.
        diff = insilicoMassLeftBo[Iindex] - peak;
        if (diff < 0) { diff = -diff; }
        differencesArray[slot] = diff;
        if (diff < tolerance) { matchesFound[slot] = 1; } else { matchesFound[slot] = 0; }
        slot += planeSize;

        // Plane 5: left B*.
        diff = insilicoMassLeftBstar[Iindex] - peak;
        if (diff < 0) { diff = -diff; }
        differencesArray[slot] = diff;
        if (diff < tolerance) { matchesFound[slot] = 1; } else { matchesFound[slot] = 0; }
        slot += planeSize;

        // Plane 6: right Yo.
        diff = insilicoMassRightYo[Iindex] - peak;
        if (diff < 0) { diff = -diff; }
        differencesArray[slot] = diff;
        if (diff < tolerance) { matchesFound[slot] = 1; } else { matchesFound[slot] = 0; }
        slot += planeSize;

        // Plane 7: right Y*.
        diff = insilicoMassRightYstar[Iindex] - peak;
        if (diff < 0) { diff = -diff; }
        differencesArray[slot] = diff;
        if (diff < tolerance) { matchesFound[slot] = 1; } else { matchesFound[slot] = 0; }
        slot += planeSize;

        // Plane 8: right Zo.
        diff = insilicoMassRightZo[Iindex] - peak;
        if (diff < 0) { diff = -diff; }
        differencesArray[slot] = diff;
        if (diff < tolerance) { matchesFound[slot] = 1; } else { matchesFound[slot] = 0; }
        slot += planeSize;

        // Plane 9: right Zoo.
        diff = insilicoMassRightZoo[Iindex] - peak;
        if (diff < 0) { diff = -diff; }
        differencesArray[slot] = diff;
        if (diff < tolerance) { matchesFound[slot] = 1; } else { matchesFound[slot] = 0; }
    }
}