// Winograd fast-convolution path (F(2x2, 3x3)): first transforms the 3x3 kernel
// on the GPU, then runs a dedicated conv shader that produces 2x2 output tiles
// per dispatch cell. Only valid for 3x3 kernels (shader names pin this).
Tensor Conv2DWinograd(Tensor X, Tensor K, Tensor B, Tensor O, int[] stride, int[] pad)
{
    // Contract checks: input channels match kernel depth, one bias per output
    // channel, bias is flat, and stride/pad carry 2 and 4 components respectively.
    Assert.AreEqual(X.channels, K.kernelDepth);
    Assert.AreEqual(K.kernelCount, B.flatWidth);
    Assert.AreEqual(B.flatWidth, B.length);
    Assert.AreEqual(stride.Length, 2);
    Assert.AreEqual(pad.Length, 4);

    // Winograd
    // transform kernel
    // Transformed-kernel shape grows the first two dims by one (3x3 -> 4x4 tile).
    // NOTE(review): assumes K.batch/K.height here map to kernel spatial dims in
    // this TensorShape layout — confirm against TensorShape's axis convention.
    TensorShape Kws = new TensorShape(K.batch + 1, K.height + 1, K.width, K.channels);

    var fn_wk = new ComputeFunc(m_Kernels, "KernelWinograd_3x3");
    fn_wk.SetTensor("X", K.shape, Pin(K).buffer);
    // Dispatch grid: one cell per (output channel, input channel) pair.
    var Kw = Dispatch(fn_wk, Kws, K.kernelCount, X.channels, 1);

    var fn_w = new ComputeFunc(m_Kernels, "Conv2DWinograd_2x2_3x3");
    SetTensor(fn_w, "X", X);
    SetTensor(fn_w, "K", Kw);
    SetTensor(fn_w, "B", B);
    fn_w.shader.SetInts("_Pad", pad);
    // Each thread covers a 2x2 output tile, hence the ceil-divide of width/height by 2.
    var OW = Dispatch(fn_w, O.shape, Kw.kernelCount, IDivC(O.width, 2), IDivC(O.height, 2));

    return(OW);
}
/// <summary>
/// Selects the cheapest kernel variant from <paramref name="entrees"/> by score
/// (lower is better; ties keep the earliest candidate) and builds a ready-to-run
/// ComputeKernel for it. Falls back to the first entry when nothing scores better
/// than long.MaxValue.
/// </summary>
public static ComputeKernel BestKernel(ComputeShader[] kernels, ComputeKernelLibrary.Entry[] entrees, bool verbose)
{
    var bestEntry = entrees[0];
    var bestScore = long.MaxValue;

    // Linear min-scan over all candidate entries.
    foreach (var candidate in entrees)
    {
        var score = CalculateEntryScore(kernels, candidate, verbose);
        if (score < bestScore)
        {
            bestScore = score;
            bestEntry = candidate;
        }
    }

    if (verbose)
    {
        D.Log(bestEntry.name);
    }

    var func = new ComputeFunc(kernels, bestEntry.name);

    // Plain kernels dispatch with the entry's own grid.
    if (bestEntry.loopStride <= 0)
    {
        return new ComputeKernel(func, bestEntry.dispatch);
    }

    // Loop-stride kernels launch a fixed 1D grid and stride over the data;
    // the shader reads the stride back via the "_LoopStride" uniform.
    int preferedDispatch = (int)bestEntry.loopStride * (int)func.threadGroupSizeX;
    var kernel = new ComputeKernel(func, new int[] { preferedDispatch, 1, 1 });
    kernel.shader.SetInt("_LoopStride", preferedDispatch);
    return kernel;
}
// Scores a candidate kernel entry for selection by BestKernel. Lower is better;
// long.MaxValue (InvalidEntry) marks entries that must not be dispatched.
// An entry is rejected when: it is flagged invalid, its thread group exceeds the
// device limit, a "strict" entry's dispatch is not an exact multiple of its
// thread group, or (for non-loop-stride kernels) any group count exceeds the
// 65535 per-dimension dispatch limit.
internal static long CalculateEntryScore(ComputeShader[] kernels, ComputeKernelLibrary.Entry entry, bool verbose)
{
    const long InvalidEntry = long.MaxValue;
    long work = InvalidEntry;
    try
    {
        if (!entry.valid)
        {
            return(InvalidEntry);
        }

        // @TODO: @OPTIMIZE: cache threadGroupSize instead of creating ComputeFunc and querying every time
        var fn = new ComputeFunc(kernels, entry.name);

        // Reject kernels whose total thread-group size exceeds the device maximum.
        if (fn.threadGroupSizeX * fn.threadGroupSizeY * fn.threadGroupSizeZ > ComputeInfo.maxComputeWorkGroupSize)
        {
            return(InvalidEntry);
        }

        if (entry.strict)
        {
            // Strict entries require the dispatch to tile exactly (no partial groups).
            if (entry.dispatch[0] % fn.threadGroupSizeX != 0 ||
                entry.dispatch[1] % fn.threadGroupSizeY != 0 ||
                entry.dispatch[2] % fn.threadGroupSizeZ != 0)
            {
                return(InvalidEntry);
            }
        }

        // Group counts per dimension (ceil-divide of dispatch by group size).
        var x = (long)ComputeFunc.IntDivCeil(entry.dispatch[0], (int)fn.threadGroupSizeX);
        var y = (long)ComputeFunc.IntDivCeil(entry.dispatch[1], (int)fn.threadGroupSizeY);
        var z = (long)ComputeFunc.IntDivCeil(entry.dispatch[2], (int)fn.threadGroupSizeZ);

        // Loop-stride kernels (loopStride != 0) dispatch a fixed 1D grid instead,
        // so the 65535-per-dimension limit only applies when loopStride == 0.
        if (entry.loopStride == 0 &&
            (x > 65535 || y > 65535 || z > 65535))
        {
            if (verbose)
            {
                D.LogWarning($"Kernel {entry.name} dispatch arguments out of range (any [{x},{y},{z}] > 65535), skipping..");
            }
            return(InvalidEntry);
        }

        // Score = total threads launched, weighted by the entry's bigO factor.
        work = x * y * z;
        work *= (int)fn.threadGroupSize;
        work = (long)(entry.bigO * work);
    }
    catch (ArgumentException)
    {
        // Kernel name not found (or similar lookup failure): fall through with
        // work still == InvalidEntry so the entry is skipped, not fatal.
        if (verbose)
        {
            D.LogWarning($"Kernel processing failed, skipping {entry.name}");
        }
    }
    return(work);
}
/// <summary>
/// Selects the best (lowest-score) valid kernel entry, giving precedence to
/// entries flagged with devicePriority: as soon as one priority entry is seen,
/// non-priority entries stop competing. On equal scores the later entry wins.
/// Builds and returns a ready-to-dispatch ComputeKernel for the winner.
/// </summary>
public static ComputeKernel BestKernel(ComputeShader[] kernels, ComputeKernelLibrary.Entry[] entrees, bool verbose)
{
    var bestEntry = entrees[0];
    var bestScore = InvalidEntry;
    bool foundKernelWithDevicePriority = false;

    for (int i = 0; i < entrees.Length; i++)
    {
        var score = CalculateEntryScore(kernels, entrees[i], verbose);
        bool entryDevicePriority = entrees[i].devicePriority;

        if (score == InvalidEntry)
        {
            continue;
        }

        // first time we encounter a kernel with device priority:
        // it unconditionally displaces whatever non-priority best we had.
        if (!foundKernelWithDevicePriority && entryDevicePriority)
        {
            bestScore = score;
            bestEntry = entrees[i];
        }
        // compute best entry: sort only on priority kernels (if some exist), else sort on non priority.
        // Fixed here: the original updated bestScore and bestEntry in two separate
        // conditional statements where the second comparison depended on the first
        // statement's mutation — correct only by accident of ordering. A single
        // guarded block makes the tie-breaking (<=, later entry wins) explicit.
        else if ((!foundKernelWithDevicePriority && !entryDevicePriority) ||
                 (foundKernelWithDevicePriority && entryDevicePriority))
        {
            if (score <= bestScore)
            {
                bestScore = score;
                bestEntry = entrees[i];
            }
        }

        foundKernelWithDevicePriority = foundKernelWithDevicePriority || entryDevicePriority;
    }

    if (verbose)
    {
        D.Log(bestEntry.name);
    }

    var func = new ComputeFunc(kernels, bestEntry.name);

    if (bestEntry.loopStride > 0)
    {
        // Loop-stride kernels launch a fixed 1D grid and stride over the data;
        // the stride is passed to the shader via "_LoopStride".
        int preferedDispatch = (int)bestEntry.loopStride * (int)func.threadGroupSizeX;
        var kernel = new ComputeKernel(func, new int[] { preferedDispatch, 1, 1 });
        kernel.shader.SetInt("_LoopStride", preferedDispatch);
        return(kernel);
    }
    else
    {
        return(new ComputeKernel(func, bestEntry.dispatch));
    }
}
/// <summary>
/// Builds a DispatchInfo record for a compute dispatch, tagging it with the
/// shader backend that produced it ("REF" for the reference context, "OPT" otherwise).
/// </summary>
internal static DispatchInfo CreateFromComputeFunc(ComputeFunc computeFunc, int x, int y, int z)
{
    bool isReference = computeFunc.computeShaderContext == ComputeShaderContext.Reference;
    string backend = isReference ? "REF" : "OPT";
    return new DispatchInfo(backend, computeFunc.kernelName, x, y, z);
}
/// <summary>
/// Binds a compute function to its dispatch dimensions.
/// </summary>
/// <param name="func_">Compiled compute function (shader + kernel) to run.</param>
/// <param name="dispatch_">Dispatch sizes; stored as-is, not copied — callers must not mutate.</param>
public ComputeKernel(ComputeFunc func_, int[] dispatch_)
{
    func = func_;
    dispatch = dispatch_;
}