public static void histo_kernel(GThread thread, byte[] buffer, int size, uint[] histo)
{
    // Per-block accumulator in shared memory: one slot per possible byte value.
    // The kernel is launched with 256 threads, so clearing it is one write per thread.
    uint[] temp = thread.AllocateShared<uint>("temp", 256);
    temp[thread.threadIdx.x] = 0;
    thread.SyncThreads();

    // Each thread starts at its global index and advances by the total number
    // of launched threads until the whole buffer segment has been covered.
    int step = thread.blockDim.x * thread.gridDim.x;
    for (int idx = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x; idx < size; idx += step)
    {
        thread.atomicAdd(ref temp[buffer[idx]], 1);
    }

    // Wait for every shared-memory update in this block, then fold the
    // block-local counts into the global histogram — again one write per
    // thread, since both arrays have 256 entries.
    thread.SyncThreads();
    thread.atomicAdd(ref histo[thread.threadIdx.x], temp[thread.threadIdx.x]);
}
// Builds a 256-bin byte histogram of `buffer` into `histo`.
// Duplicate of the histo_kernel defined above — NOTE(review): confirm whether
// both copies are intentional (e.g. live in different classes) or one should
// be removed.
public static void histo_kernel(GThread thread, byte[] buffer, int size, uint[] histo)
{
    // Clear out the shared accumulation buffer called `temp`. Since the kernel
    // is launched with 256 threads, that is one write per thread.
    uint[] temp = thread.AllocateShared<uint>("temp", 256);
    temp[thread.threadIdx.x] = 0;
    thread.SyncThreads();

    // Starting index for this thread, and the offset to the next element it
    // will process (total number of launched threads).
    int i = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x;
    int stride = thread.blockDim.x * thread.gridDim.x;
    while (i < size)
    {
        // Shared-memory atomic: several threads may hit the same byte value.
        thread.atomicAdd(ref temp[buffer[i]], 1);
        i += stride;
    }

    // Sync the writes to shared memory above, then add the block-local counts
    // to the global histogram with global-memory atomic adds — one write per
    // thread, as both arrays have 256 entries.
    thread.SyncThreads();
    thread.atomicAdd(ref (histo[thread.threadIdx.x]), temp[thread.threadIdx.x]);
}
public static void MultiplySparseGPU2(GThread thread, int kernelCount, int[] indicesA, float[] valuesA, int nonzeroCountA, int colCountA, float[] B, int colCountB, float[] C, int transposeA)
{
    // Sparse-dense multiply: one work item per (nonzero of A, column of B)
    // pair, distributed across the grid in strides of the total thread count.
    int step = thread.blockDim.x * thread.gridDim.x;
    for (var work = (thread.blockIdx.x * thread.blockDim.x) + thread.threadIdx.x; work < kernelCount; work += step)
    {
        // Decode which nonzero of A and which column of B this item covers.
        var bCol = work % colCountB;
        var nz = work / colCountB;
        var aValue = valuesA[nz];
        var flat = indicesA[nz];
        var aRow = flat / colCountA;
        var aCol = flat % colCountA;
        if (transposeA != 0)
        {
            // Interpret the stored (row, col) as its transpose.
            var swap = aRow;
            aRow = aCol;
            aCol = swap;
        }
        // Accumulate the partial product into C; atomic because multiple
        // nonzeros can target the same output cell.
        thread.atomicAdd(ref C[aRow * colCountB + bCol], aValue * B[aCol * colCountB + bCol]);
    }
}
// Parallel Failure-less Aho-Corasick (PFAC) scan over a file segment.
// Every thread starts a fresh match attempt at its own byte offset and walks
// the DFA encoded in `lookup` until it hits the failure state (0).
// Results: resultCount tallies hits per pattern; foundCount[0] is a shared
// cursor into foundID/foundSOF, which record the terminal state and the
// start-of-find offset of each hit.
public static void PFACAnalyse(GThread thread, byte[] buffer, int initialState, int[,] lookup, int[] targetEndLength, uint[] resultCount, int[] foundCount, byte[] foundID, int[] foundSOF)
{
    int n = buffer.Length;
    int i = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x; // first offset for this thread
    int stride = thread.blockDim.x * thread.gridDim.x;                  // offsets advanced per iteration across the grid
    for (; i < n; i += stride) // loop to scan the full file segment
    {
        int state = initialState;
        int pos = i;
        while (pos < n)
        {
            state = lookup[state, buffer[pos]];
            if (state == 0)
            {
                break; // failure state: no pattern continues from offset i
            }
            if (state < initialState) // terminal states are numbered below initialState
            {
                if ((state - 1) % 2 == 0)
                {
                    // Odd terminal state: record the hit unconditionally.
                    thread.atomicAdd(ref resultCount[(int)((state + 1) / 2) - 1], 1);
                    int counter = thread.atomicAdd(ref foundCount[0], 1);
                    foundID[counter] = (byte)state;
                    foundSOF[counter] = i;
                }
                else
                {
                    // Even terminal state: inspect the bytes just past the
                    // target's expected end before recording the hit.
                    int fileEnd = i + targetEndLength[((state + 1) / 2) - 1];
                    // BUG FIX: fileEnd (and fileEnd + 1) were read without a bounds
                    // check, so a match near the end of the segment indexed past
                    // buffer.Length. Truncated candidates are now skipped instead.
                    if (fileEnd + 1 < n && buffer[fileEnd] != 0x38 && buffer[fileEnd + 1] != 0x38 && buffer[fileEnd + 1] != 0x3B)
                    {
                        int counter = thread.atomicAdd(ref foundCount[0], 1);
                        foundID[counter] = (byte)state;
                        foundSOF[counter] = i;
                    }
                }
            }
            pos++;
        }
    }
    thread.SyncThreads(); // sync GPU threads in this block before returning
}
// Accumulates the back-propagated weighted deltas of layer warstwa[0] + 1
// into the sum matrix of layer warstwa[0]. One (x, y) pair per block.
public static void LiczSumyWarstwyGPU(GThread watek, int[] warstwa, float[,] macierzDelt, float[,] macierzSum, float[, ,] neuron)
{
    int col = watek.blockIdx.x;
    int row = watek.blockIdx.y;
    int layer = warstwa[0];
    // Atomic: every `row` block adds its contribution to the same sum cell.
    watek.atomicAdd(ref macierzSum[layer, col], macierzDelt[layer + 1, row] * neuron[layer + 1, row, col]);
    watek.SyncThreads();
}
// Accumulates layer warstwa[0]'s weighted inputs into its output row, then
// copies that output forward as the next layer's input.
// NOTE(review): the copy into macierzWejsc reads macierzWyjsc[warstwa[0], x]
// while blocks with other y values may still be atomicAdd-ing into it;
// SyncThreads() only synchronises threads within one block, so this looks
// racy across blocks — confirm the launch configuration makes it safe.
public static void UzupelnijWarstwe(GThread watek, float[, ,] neuron, float[,] macierzWejsc, float[,] macierzWyjsc, int[] warstwa)
{
    int x = watek.blockIdx.x;
    int y = watek.blockIdx.y;
    // Partial dot product: weight (x, y) of this layer times input y.
    watek.atomicAdd(ref macierzWyjsc[warstwa[0], x], neuron[warstwa[0], x, y] * macierzWejsc[warstwa[0], y]);
    // Propagate the (possibly still accumulating) sum to the next layer's input.
    macierzWejsc[warstwa[0] + 1, x] = macierzWyjsc[warstwa[0], x];
    watek.SyncThreads();
}
// Computes layer warstwa[0]'s output vector: each block contributes one
// weight * input product to output neuron x via an atomic add.
public static void LiczWarstweGPU(GThread watek, float[, ,] neuron, float[] wyjscie, int[] warstwa, float[,] macierzWejsc, float[,] macierzSum)
{
    int neuronIdx = watek.blockIdx.x;
    int inputIdx = watek.blockIdx.y;
    int layer = warstwa[0];
    float contribution = neuron[layer, neuronIdx, inputIdx] * macierzWejsc[layer, inputIdx];
    watek.atomicAdd(ref wyjscie[neuronIdx], contribution);
    watek.SyncThreads();
}
// Weight update: for the (x, y) pair handled by this block, adjusts every
// weight neuron[x, y, z] by stala[0] * delta(x, y) * input(x, z).
public static void UaktualnijWagiNeuronowGPU(GThread watek, float[, ,] neuron, float[,] macierzDelt, float[,] macierzWejsc, float[] stala)
{
    int x = watek.blockIdx.x;
    int y = watek.blockIdx.y;
    int depth = neuron.GetLength(2);
    for (int z = 0; z < depth; z++)
    {
        // Atomic in case several blocks target the same weight cell.
        watek.atomicAdd(ref neuron[x, y, z], stala[0] * macierzDelt[x, y] * macierzWejsc[x, z]);
    }
    watek.SyncThreads();
}
// Exercises every uint32 atomic intrinsic in sequence on input[0], storing
// each call's result in the next output slot. The trailing comments give the
// expected value per the original test annotations (input[0] starting at 0).
public static void atomicsTestUInt32(GThread thread, uint[] input, uint[] output)
{
    int slot = 0;
    output[slot++] = thread.atomicAdd(ref input[0], 42);       // 42
    output[slot++] = thread.atomicSub(ref input[0], 21);       // 21
    output[slot++] = thread.atomicIncEx(ref input[0]);         // 22
    output[slot++] = thread.atomicIncEx(ref input[0]);         // 23
    output[slot++] = thread.atomicMax(ref input[0], 50);       // 50
    output[slot++] = thread.atomicMin(ref input[0], 40);       // 40
    output[slot++] = thread.atomicOr(ref input[0], 16);        // 56
    output[slot++] = thread.atomicAnd(ref input[0], 15);       // 8
    output[slot++] = thread.atomicXor(ref input[0], 15);       // 7
    output[slot++] = thread.atomicExch(ref input[0], 88);      // 88
    output[slot++] = thread.atomicCAS(ref input[0], 88, 123);  // 123 (88 matched)
    output[slot++] = thread.atomicCAS(ref input[0], 321, 222); // 123 (321 did not match)
    output[slot++] = thread.atomicDecEx(ref input[0]);         // 122
}
// Exercises every uint32 atomic intrinsic in sequence on input[x], storing
// each result into the next output slot; trailing comments are the expected
// values per the original annotations.
// Duplicate of the atomicsTestUInt32 defined above — NOTE(review): confirm
// both copies are intentional (e.g. different classes) or remove one.
public static void atomicsTestUInt32(GThread thread, uint[] input, uint[] output)
{
    int i = 0; // next output slot
    int x = 0; // the single cell all atomics operate on
    output[i++] = thread.atomicAdd(ref input[x], 42);       // 42
    output[i++] = thread.atomicSub(ref input[x], 21);       // 21
    output[i++] = thread.atomicIncEx(ref input[x]);         // 22
    output[i++] = thread.atomicIncEx(ref input[x]);         // 23
    output[i++] = thread.atomicMax(ref input[x], 50);       // 50
    output[i++] = thread.atomicMin(ref input[x], 40);       // 40
    output[i++] = thread.atomicOr(ref input[x], 16);        // 56
    output[i++] = thread.atomicAnd(ref input[x], 15);       // 8
    output[i++] = thread.atomicXor(ref input[x], 15);       // 7
    output[i++] = thread.atomicExch(ref input[x], 88);      // 88
    output[i++] = thread.atomicCAS(ref input[x], 88, 123);  // 123 (88 matched)
    output[i++] = thread.atomicCAS(ref input[x], 321, 222); // 123 (321 did not match)
    output[i++] = thread.atomicDecEx(ref input[x]);         // 122
}