/// <summary>
/// Builds a 256-bin histogram of the byte values in <paramref name="buffer"/>. Each block first
/// accumulates into a shared scratch array and then adds its partial counts into
/// <paramref name="histo"/>.
/// </summary>
/// <param name="thread">Thread context providing indices, shared allocation, barriers and atomics.</param>
/// <param name="buffer">Input bytes; each byte value selects the bin that is incremented.</param>
/// <param name="size">Number of leading elements of <paramref name="buffer"/> to process.</param>
/// <param name="histo">Destination histogram (256 bins are touched); counts are added to its current contents.</param>
public static void histo_kernel(GThread thread, byte[] buffer, int size, uint[] histo)
        {
            // One bin per possible byte value.
            int binCount = 256;
            uint[] temp = thread.AllocateShared<uint>("temp", 256);

            // Clear the scratch bins. Striding by blockDim.x covers all 256 bins for any
            // block size; with exactly 256 threads this is still one write per thread.
            for (int bin = thread.threadIdx.x; bin < binCount; bin += thread.blockDim.x)
            {
                temp[bin] = 0;
            }
            thread.SyncThreads();

            // Starting element for this thread and the distance to its next element.
            int i      = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x;
            int stride = thread.blockDim.x * thread.gridDim.x;

            while (i < size)
            {
                thread.atomicAdd(ref temp[buffer[i]], 1);
                i += stride;
            }

            // Wait until every thread of the block has finished updating the scratch bins
            // before they are merged into the global histogram.
            thread.SyncThreads();

            // Merge this block's counts. Atomic adds are used because other blocks add into
            // the same histo entries. Threads with threadIdx.x >= 256 simply skip the loop
            // instead of indexing past the end of temp/histo.
            for (int bin = thread.threadIdx.x; bin < binCount; bin += thread.blockDim.x)
            {
                thread.atomicAdd(ref histo[bin], temp[bin]);
            }
        }
Example #2
0
        /// <summary>
        /// Computes a 256-bin histogram of the bytes in <paramref name="buffer"/>: every block counts
        /// into a shared scratch array, then adds those counts into <paramref name="histo"/>.
        /// </summary>
        /// <param name="thread">Thread context providing indices, shared allocation, barriers and atomics.</param>
        /// <param name="buffer">Input bytes; the value of each byte is the index of the bin to increment.</param>
        /// <param name="size">Number of leading elements of <paramref name="buffer"/> to process.</param>
        /// <param name="histo">Destination histogram (256 bins are touched); counts are added to its current contents.</param>
        public static void histo_kernel(GThread thread, byte[] buffer, int size, uint[] histo)
        {
            // One bin per possible byte value.
            int binCount = 256;
            uint[] temp = thread.AllocateShared<uint>("temp", 256);

            // Zero the scratch bins with a block-sized stride so the kernel no longer
            // depends on being launched with exactly 256 threads per block.
            for (int bin = thread.threadIdx.x; bin < binCount; bin += thread.blockDim.x)
            {
                temp[bin] = 0;
            }
            thread.SyncThreads();

            // First element handled by this thread and the step to its next element.
            int i = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x;
            int stride = thread.blockDim.x * thread.gridDim.x;
            while (i < size)
            {
                thread.atomicAdd(ref temp[buffer[i]], 1);
                i += stride;
            }

            // All scratch updates from this block must be complete before merging.
            thread.SyncThreads();

            // Add the block-local counts to the global histogram atomically, since other
            // blocks add into the same entries. With 256 threads this is one add per thread;
            // with more, the extra threads do nothing; with fewer, each thread handles several bins.
            for (int bin = thread.threadIdx.x; bin < binCount; bin += thread.blockDim.x)
            {
                thread.atomicAdd(ref histo[bin], temp[bin]);
            }
        }
Example #3
0
        /// <summary>
        /// Accumulates the product of sparse matrix A (given as parallel index/value arrays) and
        /// dense matrix <paramref name="B"/> into <paramref name="C"/> using atomic adds.
        /// </summary>
        /// <param name="thread">Thread context providing indices and atomics.</param>
        /// <param name="kernelCount">Total number of work items; each item is one (A entry, B column) pair.</param>
        /// <param name="indicesA">Flattened position (row * colCountA + col) of each stored A entry.</param>
        /// <param name="valuesA">Value of each stored A entry, parallel to <paramref name="indicesA"/>.</param>
        /// <param name="nonzeroCountA">Not read by this kernel.</param>
        /// <param name="colCountA">Divisor used to split a flattened A position into row and column.</param>
        /// <param name="B">Dense right-hand matrix, indexed as row * colCountB + col.</param>
        /// <param name="colCountB">Number of columns of B (and of C).</param>
        /// <param name="C">Result matrix, indexed as row * colCountB + col; products are added to its current contents.</param>
        /// <param name="transposeA">Non-zero to swap the row and column decoded from each A position.</param>
        public static void MultiplySparseGPU2(GThread thread, int kernelCount, int[] indicesA, float[] valuesA, int nonzeroCountA, int colCountA, float[] B, int colCountB, float[] C, int transposeA)
        {
            // Distance between consecutive work items handled by the same thread.
            int step = thread.blockDim.x * thread.gridDim.x;

            for (int work = (thread.blockIdx.x * thread.blockDim.x) + thread.threadIdx.x; work < kernelCount; work += step)
            {
                // A work item encodes which stored A entry and which column of B to combine.
                int entry = work / colCountB;
                int colB  = work % colCountB;

                float weight = valuesA[entry];
                int packed   = indicesA[entry];

                int rowA;
                int colA;
                if (transposeA != 0)
                {
                    // Transposed use of A: the decoded row and column trade places.
                    rowA = packed % colCountA;
                    colA = packed / colCountA;
                }
                else
                {
                    rowA = packed / colCountA;
                    colA = packed % colCountA;
                }

                float product = weight * B[colA * colCountB + colB];

                // Several work items can target the same C element, hence the atomic add.
                thread.atomicAdd(ref C[rowA * colCountB + colB], product);
            }
        }
Example #4
0
        /// <summary>
        /// Walks the <paramref name="lookup"/> transition table over <paramref name="buffer"/> from every
        /// start offset and records each match state that is reached.
        /// </summary>
        /// <remarks>
        /// A transition to state 0 ends the walk for the current start offset. Any state below
        /// <paramref name="initialState"/> is treated as a match. Odd match states are counted in
        /// <paramref name="resultCount"/> and recorded; even match states are recorded only when the two
        /// bytes located through <paramref name="targetEndLength"/> pass the byte check in the code.
        /// NOTE(review): the names suggest odd states are file headers and even states are footers - confirm.
        /// NOTE(review): the slot obtained from foundCount[0] is not checked against the capacity of
        /// foundID / foundSOF; callers must size those arrays for the worst case.
        /// </remarks>
        /// <param name="thread">Thread context providing indices, barriers and atomics.</param>
        /// <param name="buffer">Bytes to scan; its full length is processed.</param>
        /// <param name="initialState">State every walk starts in; states below it are match states.</param>
        /// <param name="lookup">Transition table indexed by [state, byte value].</param>
        /// <param name="targetEndLength">Per-target offset added to the start offset to locate the bytes checked for even states.</param>
        /// <param name="resultCount">Per-target counters incremented for odd match states.</param>
        /// <param name="foundCount">Element 0 is the shared running count of recorded matches.</param>
        /// <param name="foundID">Receives the match state (truncated to a byte) of each recorded match.</param>
        /// <param name="foundSOF">Receives the start offset of each recorded match.</param>
        public static void PFACAnalyse(GThread thread, byte[] buffer, int initialState, int[,] lookup, int[] targetEndLength, uint[] resultCount, int[] foundCount, byte[] foundID, int[] foundSOF)
        {
            int n = buffer.Length;

            int i      = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x; // First start offset for this thread
            int stride = thread.blockDim.x * thread.gridDim.x;                       // Distance to this thread's next start offset

            for (; i < n; i += stride)
            {
                int state = initialState;
                int pos   = i;

                while (pos < n)
                {
                    state = lookup[state, buffer[pos]];
                    if (state == 0)
                    {
                        // No transition for this byte: nothing further can match from offset i.
                        break;
                    }
                    if (state < initialState)
                    {
                        // Match states come in pairs that share one target index.
                        int target = ((state + 1) / 2) - 1;

                        if ((state - 1) % 2 == 0)
                        {
                            thread.atomicAdd(ref resultCount[target], 1);
                            int counter = thread.atomicAdd(ref foundCount[0], 1);
                            foundID[counter]  = (byte)state;
                            foundSOF[counter] = i;
                        }
                        else
                        {
                            int fileEnd = i + targetEndLength[target];

                            // Both inspected bytes must lie inside the buffer; previously a match near
                            // the end of the buffer read past it. Out-of-range candidates are skipped.
                            if (fileEnd >= 0 && fileEnd + 1 < n &&
                                buffer[fileEnd] != 0x38 && buffer[fileEnd + 1] != 0x38 && buffer[fileEnd + 1] != 0x3B)
                            {
                                int counter = thread.atomicAdd(ref foundCount[0], 1);
                                foundID[counter]  = (byte)state;
                                foundSOF[counter] = i;
                            }
                        }
                    }
                    pos++;
                }
            }

            thread.SyncThreads();
        }
Example #5
0
        /// <summary>
        /// Adds one delta-times-weight product into the sum matrix for the layer selected by
        /// <c>warstwa[0]</c>. The block's x index selects the destination column and its y index
        /// selects the term taken from the following layer.
        /// </summary>
        /// <param name="watek">Thread context providing block indices, barriers and atomics.</param>
        /// <param name="warstwa">Element 0 holds the index of the layer being processed.</param>
        /// <param name="macierzDelt">Delta matrix, read at [layer + 1, y].</param>
        /// <param name="macierzSum">Sum matrix, accumulated at [layer, x].</param>
        /// <param name="neuron">Weight array, read at [layer + 1, y, x].</param>
        public static void LiczSumyWarstwyGPU(GThread watek, int[] warstwa, float[,] macierzDelt, float[,] macierzSum, float[, ,] neuron)
        {
            int layer     = warstwa[0];
            int nextLayer = layer + 1;
            int target    = watek.blockIdx.x;
            int source    = watek.blockIdx.y;

            float term = macierzDelt[nextLayer, source] * neuron[nextLayer, source, target];

            // Atomic form of "macierzSum[layer, target] += term": blocks that share an x index
            // add into the same element.
            watek.atomicAdd(ref macierzSum[layer, target], term);
            watek.SyncThreads();
        }
Example #6
0
        /// <summary>
        /// Adds one weight-times-input product into the output matrix for the layer selected by
        /// <c>warstwa[0]</c>, then copies the current output value into the next layer's input row.
        /// </summary>
        /// <remarks>
        /// NOTE(review): blocks that share an x index but differ in y all add into the same output
        /// element, and each of them performs the copy right after its own add. Nothing in this kernel
        /// makes a block wait for the other blocks, so the copied value may be a partial sum - confirm
        /// how the kernel is launched and whether the copy should move to a separate step.
        /// </remarks>
        /// <param name="watek">Thread context providing block indices, barriers and atomics.</param>
        /// <param name="neuron">Weight array, read at [layer, x, y].</param>
        /// <param name="macierzWejsc">Input matrix, read at [layer, y] and written at [layer + 1, x].</param>
        /// <param name="macierzWyjsc">Output matrix, accumulated at [layer, x].</param>
        /// <param name="warstwa">Element 0 holds the index of the layer being processed.</param>
        public static void UzupelnijWarstwe(GThread watek, float[, ,] neuron, float[,] macierzWejsc, float[,] macierzWyjsc, int[] warstwa)
        {
            int layer  = warstwa[0];
            int target = watek.blockIdx.x;
            int source = watek.blockIdx.y;

            float term = neuron[layer, target, source] * macierzWejsc[layer, source];
            watek.atomicAdd(ref macierzWyjsc[layer, target], term);

            // Forward whatever the output element holds at this moment to the next layer's input.
            float current = macierzWyjsc[layer, target];
            macierzWejsc[layer + 1, target] = current;

            watek.SyncThreads();
        }
Example #7
0
        /// <summary>
        /// Adds one weight-times-input product for the layer selected by <c>warstwa[0]</c> into the
        /// output vector. The block's x index selects the output element and its y index selects the input.
        /// </summary>
        /// <param name="watek">Thread context providing block indices, barriers and atomics.</param>
        /// <param name="neuron">Weight array, read at [layer, x, y].</param>
        /// <param name="wyjscie">Output vector, accumulated at [x].</param>
        /// <param name="warstwa">Element 0 holds the index of the layer being processed.</param>
        /// <param name="macierzWejsc">Input matrix, read at [layer, y].</param>
        /// <param name="macierzSum">Not read by this kernel.</param>
        public static void LiczWarstweGPU(GThread watek, float[, ,] neuron, float[] wyjscie, int[] warstwa, float[,] macierzWejsc, float[,] macierzSum)
        {
            int layer  = warstwa[0];
            int target = watek.blockIdx.x;
            int source = watek.blockIdx.y;

            float term = neuron[layer, target, source] * macierzWejsc[layer, source];

            // Atomic form of "wyjscie[target] += term": blocks that share an x index
            // add into the same element.
            watek.atomicAdd(ref wyjscie[target], term);
            watek.SyncThreads();
        }
Example #8
0
        /// <summary>
        /// Updates every weight along the third dimension of <paramref name="neuron"/> at the position
        /// selected by the block indices, adding <c>stala[0] * macierzDelt[x, y] * macierzWejsc[x, z]</c>
        /// to <c>neuron[x, y, z]</c>.
        /// </summary>
        /// <param name="watek">Thread context providing block indices, barriers and atomics.</param>
        /// <param name="neuron">Weight array updated in place; its third dimension is iterated in full.</param>
        /// <param name="macierzDelt">Delta matrix, read at [x, y].</param>
        /// <param name="macierzWejsc">Input matrix, read at [x, z].</param>
        /// <param name="stala">Element 0 is the scalar factor applied to every update (presumably the learning rate - confirm).</param>
        public static void UaktualnijWagiNeuronowGPU(GThread watek, float[, ,] neuron, float[,] macierzDelt, float[,] macierzWejsc, float[] stala)
        {
            int first  = watek.blockIdx.x;
            int second = watek.blockIdx.y;
            int depth  = neuron.GetLength(2);

            for (int k = 0; k < depth; k++)
            {
                // Atomic form of "neuron[first, second, k] += factor * delta * input".
                watek.atomicAdd(ref neuron[first, second, k], stala[0] * macierzDelt[first, second] * macierzWejsc[first, k]);
            }

            watek.SyncThreads();
        }
Example #9
0
        /// <summary>
        /// Runs a fixed sequence of unsigned 32-bit atomic operations on <c>input[0]</c> and stores the
        /// value returned by each call in consecutive elements of <paramref name="output"/> (13 in total).
        /// </summary>
        /// <remarks>
        /// The trailing number on each line appears to be the value left in <c>input[0]</c> after that
        /// call when it starts at 0; it is not necessarily the value written to <paramref name="output"/>,
        /// which is whatever the atomic returns (presumably the previous value - confirm).
        /// </remarks>
        /// <param name="thread">Thread context providing the atomic operations.</param>
        /// <param name="input">Element 0 is the cell every atomic operates on.</param>
        /// <param name="output">Receives the 13 return values, in call order.</param>
        public static void atomicsTestUInt32(GThread thread, uint[] input, uint[] output)
        {
            output[0]  = thread.atomicAdd(ref input[0], 42);        // input[0] -> 42
            output[1]  = thread.atomicSub(ref input[0], 21);        // input[0] -> 21
            output[2]  = thread.atomicIncEx(ref input[0]);          // input[0] -> 22
            output[3]  = thread.atomicIncEx(ref input[0]);          // input[0] -> 23
            output[4]  = thread.atomicMax(ref input[0], 50);        // input[0] -> 50
            output[5]  = thread.atomicMin(ref input[0], 40);        // input[0] -> 40
            output[6]  = thread.atomicOr(ref input[0], 16);         // input[0] -> 56
            output[7]  = thread.atomicAnd(ref input[0], 15);        // input[0] -> 8
            output[8]  = thread.atomicXor(ref input[0], 15);        // input[0] -> 7
            output[9]  = thread.atomicExch(ref input[0], 88);       // input[0] -> 88
            output[10] = thread.atomicCAS(ref input[0], 88, 123);   // input[0] -> 123 (comparand matches)
            output[11] = thread.atomicCAS(ref input[0], 321, 222);  // input[0] -> 123 (comparand differs)
            output[12] = thread.atomicDecEx(ref input[0]);          // input[0] -> 122
        }
Example #10
0
 /// <summary>
 /// Applies a fixed chain of unsigned 32-bit atomic operations to <c>input[0]</c>, writing the value
 /// returned by each call to the next element of <paramref name="output"/> (13 elements in total).
 /// </summary>
 /// <remarks>
 /// The number after each call appears to be the content of <c>input[0]</c> once that call has run,
 /// assuming it started at 0. What lands in <paramref name="output"/> is the atomic's return value
 /// (presumably the prior content - confirm).
 /// </remarks>
 /// <param name="thread">Thread context providing the atomic operations.</param>
 /// <param name="input">Element 0 is the cell every atomic operates on.</param>
 /// <param name="output">Receives the 13 return values, in call order.</param>
 public static void atomicsTestUInt32(GThread thread, uint[] input, uint[] output)
 {
     int cell = 0;

     // Arithmetic
     output[0] = thread.atomicAdd(ref input[cell], 42);         // now 42
     output[1] = thread.atomicSub(ref input[cell], 21);         // now 21
     output[2] = thread.atomicIncEx(ref input[cell]);           // now 22
     output[3] = thread.atomicIncEx(ref input[cell]);           // now 23

     // Min / max
     output[4] = thread.atomicMax(ref input[cell], 50);         // now 50
     output[5] = thread.atomicMin(ref input[cell], 40);         // now 40

     // Bitwise
     output[6] = thread.atomicOr(ref input[cell], 16);          // now 56
     output[7] = thread.atomicAnd(ref input[cell], 15);         // now 8
     output[8] = thread.atomicXor(ref input[cell], 15);         // now 7

     // Exchange and compare-and-swap (second CAS has a non-matching comparand)
     output[9] = thread.atomicExch(ref input[cell], 88);        // now 88
     output[10] = thread.atomicCAS(ref input[cell], 88, 123);   // now 123
     output[11] = thread.atomicCAS(ref input[cell], 321, 222);  // still 123

     output[12] = thread.atomicDecEx(ref input[cell]);          // now 122
 }
Example #11
0
        /// <summary>
        /// Multiplies sparse matrix A, supplied as parallel position/value arrays, by dense matrix
        /// <paramref name="B"/> and atomically adds the products into <paramref name="C"/>.
        /// </summary>
        /// <param name="thread">Thread context providing indices and atomics.</param>
        /// <param name="kernelCount">Total number of work items; one per (A entry, B column) pair.</param>
        /// <param name="indicesA">Flattened position (row * colCountA + col) of each stored A entry.</param>
        /// <param name="valuesA">Value of each stored A entry, parallel to <paramref name="indicesA"/>.</param>
        /// <param name="nonzeroCountA">Not read by this kernel.</param>
        /// <param name="colCountA">Divisor used to split a flattened A position into row and column.</param>
        /// <param name="B">Dense right-hand matrix, indexed as row * colCountB + col.</param>
        /// <param name="colCountB">Number of columns of B (and of C).</param>
        /// <param name="C">Result matrix, indexed as row * colCountB + col; products are added to its current contents.</param>
        /// <param name="transposeA">Non-zero to swap the row and column decoded from each A position.</param>
        public static void MultiplySparseGPU2(GThread thread, int kernelCount, int[] indicesA, float[] valuesA, int nonzeroCountA, int colCountA, float[] B, int colCountB, float[] C, int transposeA)
        {
            int item      = (thread.blockIdx.x * thread.blockDim.x) + thread.threadIdx.x;
            int itemsStep = thread.blockDim.x * thread.gridDim.x;

            while (item < kernelCount)
            {
                // Split the work item into the stored A entry and the B column it pairs with.
                int columnOfB = item % colCountB;
                int entryOfA  = item / colCountB;

                // Decode the entry's flattened position; the two coordinates trade places
                // when A is used transposed.
                int position = indicesA[entryOfA];
                int first    = position / colCountA;
                int second   = position % colCountA;

                int rowOfC = first;
                int rowOfB = second;
                if (transposeA != 0)
                {
                    rowOfC = second;
                    rowOfB = first;
                }

                float contribution = valuesA[entryOfA] * B[rowOfB * colCountB + columnOfB];

                // Different work items may hit the same C element, so the add must be atomic.
                thread.atomicAdd(ref C[rowOfC * colCountB + columnOfB], contribution);

                item += itemsStep;
            }
        }