public static void Total(FloatResidentArray a, int N, float[] total)
{
    var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int cacheIndex = threadIdx.x;

    // Grid-stride loop: each thread accumulates a partial sum over the array.
    float sum = 0f;
    while (tid < N)
    {
        sum += a[tid];
        tid += blockDim.x * gridDim.x;
    }

    cache[cacheIndex] = sum;
    CUDAIntrinsics.__syncthreads();

    // Tree reduction in shared memory: halve the number of active threads each pass.
    int i = blockDim.x / 2;
    while (i != 0)
    {
        if (cacheIndex < i)
        {
            cache[cacheIndex] += cache[cacheIndex + i];
        }
        CUDAIntrinsics.__syncthreads();
        i >>= 1;
    }

    // Thread 0 of each block atomically folds the block's result into the output.
    if (cacheIndex == 0)
    {
        AtomicExpr.apply(ref total[0], cache[0], (x, y) => x + y);
    }
}
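This kernel is written against Hybridizer's CUDA intrinsics (blockDim, threadIdx, SharedMemoryAllocator, AtomicExpr). A minimal host-side launch might look like the sketch below, assuming the kernel carries Hybridizer's [EntryPoint] attribute and lives in a class named Program (a hypothetical name); the grid/block sizes are illustrative.

using System;
using Hybridizer.Runtime.CUDAImports;

class Program
{
    static void Main()
    {
        const int N = 1024 * 1024;
        var a = new FloatResidentArray(N);
        for (int i = 0; i < N; ++i) a[i] = 1.0f;
        a.RefreshDevice();                          // push the data to the GPU

        float[] total = new float[1];               // single-slot accumulator, seeded with 0

        // 16 blocks of 128 threads; the grid-stride loop handles any N.
        HybRunner runner = HybRunner.Cuda().SetDistrib(16, 128);
        dynamic wrapped = runner.Wrap(new Program());
        wrapped.Total(a, N, total);                 // runs the reduction kernel

        Console.WriteLine(total[0]);                // expect N
    }
}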
public virtual bool VisitAtomicExpr(AtomicExpr stmt)
{
    return VisitExpr(stmt);
}
public Word Visit(AtomicExpr atomicExpr)
{
    var result = atomicExpr.Value.Accept(this);
    if (IsError(result))
    {
        return result;
    }
    //PyObj pyObj; // Uncomment this line to pass arguments by "reference copy" (i.e., as in Java).
    if (result == null)
    {
        return ErrorFactory.VoidExpr(atomicExpr);
    }
    // Uncomment this block to pass arguments by "reference copy" (i.e., as in Java).
    //if (IsMemoryBlock(result))
    //    pyObj = ((MemoryBlock)result).Value;
    //else
    //    pyObj = (PyObj)result;
    return result;
}
private static void ScalarProd(float[] result, FloatResidentArray r1, FloatResidentArray r2, int N)
{
    var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int cacheIndex = threadIdx.x;

    // Grid-stride loop: each thread accumulates a partial dot product.
    float tmp = 0.0F;
    while (tid < N)
    {
        tmp += r1[tid] * r2[tid];
        tid += blockDim.x * gridDim.x;
    }

    cache[cacheIndex] = tmp;
    CUDAIntrinsics.__syncthreads();

    // Tree reduction in shared memory.
    int i = blockDim.x / 2;
    while (i != 0)
    {
        if (cacheIndex < i)
        {
            cache[cacheIndex] += cache[cacheIndex + i];
        }
        CUDAIntrinsics.__syncthreads();
        i >>= 1;
    }

    // One atomic add per block combines the partial results into result[0].
    if (cacheIndex == 0)
    {
        AtomicExpr.apply(ref result[0], cache[0], (x, y) => x + y);
    }
}
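Because the blockwise reduction reorders floating-point additions, the GPU result will generally differ from a sequential sum in the low bits. A plain CPU reference (a hypothetical helper, not part of the snippet above) is handy for validating the kernel with a tolerance rather than exact equality.

// Hypothetical CPU reference for checking ScalarProd's output.
static float ScalarProdReference(FloatResidentArray r1, FloatResidentArray r2, int N)
{
    float acc = 0.0f;
    for (int i = 0; i < N; ++i)
    {
        acc += r1[i] * r2[i];   // sequential order; the GPU sums in a different order
    }
    return acc;
}

// Compare with a relative tolerance, not ==:
// Math.Abs(result[0] - expected) <= 1e-5f * Math.Abs(expected)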
public static void InnerReduce(float[] result, float[] input, int N, float neutral, Func<float, float, float> reductor)
{
    var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int cacheIndex = threadIdx.x;

    // Grid-stride loop, folding elements with the caller-supplied reductor,
    // starting from its neutral element.
    float tmp = neutral;
    while (tid < N)
    {
        tmp = reductor(tmp, input[tid]);
        tid += blockDim.x * gridDim.x;
    }

    cache[cacheIndex] = tmp;
    CUDAIntrinsics.__syncthreads();

    // Tree reduction in shared memory using the same reductor.
    int i = blockDim.x / 2;
    while (i != 0)
    {
        if (cacheIndex < i)
        {
            cache[cacheIndex] = reductor(cache[cacheIndex], cache[cacheIndex + i]);
        }
        CUDAIntrinsics.__syncthreads();
        i >>= 1;
    }

    // Thread 0 atomically folds the block's result into result[0].
    if (cacheIndex == 0)
    {
        AtomicExpr.apply(ref result[0], cache[0], reductor);
    }
}
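InnerReduce generalizes the two kernels above: any associative, commutative reductor whose identity is neutral can be plugged in. Note that result[0] is combined atomically with each block's partial value, so the caller should seed it with the neutral element. A couple of hypothetical invocations, reusing a HybRunner-wrapped instance as in the earlier launch sketch:

// Sum: the neutral element is 0.
float[] sum = { 0f };
wrapped.InnerReduce(sum, input, N, 0f, (x, y) => x + y);

// Max: the neutral element is negative infinity.
float[] max = { float.NegativeInfinity };
wrapped.InnerReduce(max, input, N, float.NegativeInfinity, (x, y) => Math.Max(x, y));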
public bool VisitAtomicExpr(AtomicExpr stmt)
{
    throw new NotImplementedException();
}