/// <summary>
/// Accumulates a block product into S, 16 output columns at a time.
/// For every row index i in [0, blockSize) and every column c in [0, 16):
///   S[i + Sstride * c] += sum over l in [0, blockSize) of A[i + Astride * l] * B[l * Bstride + c]
/// The running sums are kept in float and narrowed back to half only when stored.
/// </summary>
/// <param name="Ap">Base pointer of the A block; element (i, l) is read at i + Astride * l.</param>
/// <param name="Astride">Element distance between consecutive l indices of A.</param>
/// <param name="Bp">Base pointer of the B block; element (l, c) is read at l * Bstride + c.</param>
/// <param name="Bstride">Element distance between consecutive l indices of B.</param>
/// <param name="Sp">Base pointer of the S block; element (i, c) is read and written at i + Sstride * c.</param>
/// <param name="Sstride">Element distance between consecutive columns of S.</param>
static void MultiplyBlockUnrollHx16(half *Ap, int Astride, half *Bp, int Bstride, half *Sp, int Sstride)
{
    for (int row = 0; row < blockSize; row++)
    {
        // All 16 outputs of this iteration share the same row offset inside S.
        half *sRow = Sp + row;

        // Load the current contents of S so the product is added on top of it.
        float acc0 = sRow[Sstride * 0];
        float acc1 = sRow[Sstride * 1];
        float acc2 = sRow[Sstride * 2];
        float acc3 = sRow[Sstride * 3];
        float acc4 = sRow[Sstride * 4];
        float acc5 = sRow[Sstride * 5];
        float acc6 = sRow[Sstride * 6];
        float acc7 = sRow[Sstride * 7];
        float acc8 = sRow[Sstride * 8];
        float acc9 = sRow[Sstride * 9];
        float accA = sRow[Sstride * 10];
        float accB = sRow[Sstride * 11];
        float accC = sRow[Sstride * 12];
        float accD = sRow[Sstride * 13];
        float accE = sRow[Sstride * 14];
        float accF = sRow[Sstride * 15];

        for (int inner = 0; inner < blockSize; inner++)
        {
            // One element of A is reused against 16 consecutive elements of B.
            float a = Ap[row + Astride * inner];
            half *bRow = Bp + inner * Bstride;

            acc0 += a * (float)bRow[0];
            acc1 += a * (float)bRow[1];
            acc2 += a * (float)bRow[2];
            acc3 += a * (float)bRow[3];
            acc4 += a * (float)bRow[4];
            acc5 += a * (float)bRow[5];
            acc6 += a * (float)bRow[6];
            acc7 += a * (float)bRow[7];
            acc8 += a * (float)bRow[8];
            acc9 += a * (float)bRow[9];
            accA += a * (float)bRow[10];
            accB += a * (float)bRow[11];
            accC += a * (float)bRow[12];
            accD += a * (float)bRow[13];
            accE += a * (float)bRow[14];
            accF += a * (float)bRow[15];
        }

        // Narrow the float sums back to half on store.
        sRow[Sstride * 0] = (half)acc0;
        sRow[Sstride * 1] = (half)acc1;
        sRow[Sstride * 2] = (half)acc2;
        sRow[Sstride * 3] = (half)acc3;
        sRow[Sstride * 4] = (half)acc4;
        sRow[Sstride * 5] = (half)acc5;
        sRow[Sstride * 6] = (half)acc6;
        sRow[Sstride * 7] = (half)acc7;
        sRow[Sstride * 8] = (half)acc8;
        sRow[Sstride * 9] = (half)acc9;
        sRow[Sstride * 10] = (half)accA;
        sRow[Sstride * 11] = (half)accB;
        sRow[Sstride * 12] = (half)accC;
        sRow[Sstride * 13] = (half)accD;
        sRow[Sstride * 14] = (half)accE;
        sRow[Sstride * 15] = (half)accF;
    }
}
/// <summary>
/// Produces output row <paramref name="y"/> for every batch and every output column.
/// For each output position the per-channel products input[k] * kernel[k] of all in-bounds
/// kernel taps are summed into a scratch accumulator of data.kernelCount halfs, then the
/// bias is added and the result is written to the output.
/// </summary>
/// <param name="y">Output row index handled by this invocation.</param>
public void Execute(int y)
{
    // Scratch accumulators, one per kernel channel, allocated once and reused for every output position.
    int accumulatorMemSize = data.kernelCount * sizeof(half);
    half *outputAccumulators = (half *)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);

    for (int n = 0; n < data.outBatch; ++n)
    {
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators to 0
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);

            // gather X * K results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0 || readY >= data.inHeight)
                {
                    // Tap falls into the vertical padding: contributes nothing.
                    continue;
                }

                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // Horizontal position uses the horizontal padding (previously padY was used here,
                    // which gave wrong taps whenever the two paddings differed).
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0 || readX >= data.inWidth)
                    {
                        // Tap falls into the horizontal padding: contributes nothing.
                        continue;
                    }

                    half *dst = outputAccumulators;
                    half *src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    half *kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;

                    int k = 0;
                    for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                    {
                        for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
                        {
                            *dst += (half)((*src) * (*kernel));
                        }
                    }
                    for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
                    {
                        *dst += (half)((*src) * (*kernel));
                    }
                }
            }

            {
                // write accumulators to memory and add bias
                int k = 0;
                half *src = outputAccumulators;
                half *dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                half *bias = Bptr;

                for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                {
                    for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
                    {
                        *dst = (half)((*src) + (*bias));
                    }
                }
                for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
                {
                    *dst = (half)((*src) + (*bias));
                }
            }
        }
    }

    UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
}
/// <summary>
/// Computes one blockSize x blockSize tile of the output S for one batch.
/// The tile is first filled with the bias C (indexed by output column), then the product of
/// the matching tiles of A and B is accumulated into it one blockSize-wide slice at a time.
/// Tiles that cross the edge of A, B or S are staged in zero-padded temporary blocks.
/// </summary>
/// <param name="threadID">Flat work index; decomposed into batch, tile row and tile column.</param>
public void Execute(int threadID)
{
    half *A = this.Xptr;
    half *B = this.Sptr;
    half *C = this.Bptr;
    half *S = this.Optr;

    int AM = data.AM, AN = data.AN;
    int BM = data.BM, BN = data.BN;
    int SM = data.SM, SN = data.SN;

    // threadID = batch * (dispatchThreadX * dispatchThreadY) + tileY * dispatchThreadX + tileX
    int tilesPerBatch = data.dispatchThreadX * data.dispatchThreadY;
    int batch = threadID / tilesPerBatch;
    int tileIndex = threadID % tilesPerBatch;
    int rowA = (tileIndex % data.dispatchThreadX) * blockSize;
    int colB = (tileIndex / data.dispatchThreadX) * blockSize;

    int batchOffSetA = batch * AM * AN;
    int batchOffSetS = batch * SM * SN;

    // Temporary blocks are allocated lazily, only when a tile is partial.
    half *blockTempA = null;
    half *blockTempB = null;
    half *blockTempS = null;

    // By default the kernel accumulates straight into the output.
    half *blockS = S + rowA + SM * colB + batchOffSetS;
    int strideS = SM;

    bool outputIsPartial = rowA + blockSize > SM || colB + blockSize > SN;
    if (outputIsPartial)
    {
        // Accumulate into a private full-size block and copy the valid part back at the end.
        blockTempS = AllocBlockHalf(blockSize, blockSize);
        strideS = blockSize;
        blockS = blockTempS;
    }

    // Seed the accumulation block with the bias; columns past BN start from zero.
    for (int col = 0; col < blockSize; col++)
    {
        half seed = (half)((colB + col) < BN ? C[colB + col] : 0.0f);
        for (int row = 0; row < blockSize; row++)
        {
            blockS[row + strideS * col] = seed;
        }
    }

    for (int l = 0; l < AN; l += blockSize) // inner-loop
    {
        half *blockA = A + rowA + AM * l + batchOffSetA;
        half *blockB = B + l * BN + colB;
        int strideA = AM;
        int strideB = BN;

        bool aIsPartial = rowA + blockSize > AM || l + blockSize > AN;
        if (aIsPartial)
        {
            // copy remainder of A into zero-padded block
            if (blockTempA == null)
            {
                blockTempA = AllocBlockHalf(blockSize, blockSize);
            }

            half *sourceA = blockA;
            for (int y = 0; y < blockSize; y++)
            {
                for (int x = 0; x < blockSize; x++)
                {
                    bool inside = (rowA + x) < AM && (l + y < AN);
                    blockTempA[x + blockSize * y] = (half)(inside ? sourceA[x + AM * y] : 0.0f);
                }
            }

            blockA = blockTempA;
            strideA = blockSize;
        }

        bool bIsPartial = colB + blockSize > BN || l + blockSize > BM;
        if (bIsPartial)
        {
            // copy remainder of B into zero-padded block
            if (blockTempB == null)
            {
                blockTempB = AllocBlockHalf(blockSize, blockSize);
            }

            half *sourceB = blockB;
            for (int y = 0; y < blockSize; y++)
            {
                for (int x = 0; x < blockSize; x++)
                {
                    bool inside = (colB + x) < BN && (l + y < BM);
                    blockTempB[x + blockSize * y] = (half)(inside ? sourceB[x + BN * y] : 0.0f);
                }
            }

            blockB = blockTempB;
            strideB = blockSize;
        }

        MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
    }

    if (blockS == blockTempS) // copy back
    {
        // Only the elements that lie inside S are written to the output.
        for (int y = 0; y < blockSize; y++)
        {
            int outCol = colB + y;
            for (int x = 0; x < blockSize; x++)
            {
                int outRow = rowA + x;
                if (outRow < SM && outCol < SN)
                {
                    S[outRow + SM * outCol + batchOffSetS] = blockTempS[x + blockSize * y];
                }
            }
        }
    }

    // Called unconditionally, exactly as before; blocks that were never needed are still null here.
    FreeBlock(blockTempA);
    FreeBlock(blockTempB);
    FreeBlock(blockTempS);
}
/// <summary>
/// Packs a grid of heightmap textures into a single binary file under
/// "&lt;path&gt;/&lt;terrainName&gt;". Each texture is run through kernel 1 of the "MipmapCompute"
/// compute shader, read back as floats, converted to half precision and appended to the file
/// as raw 16-bit values (textureSize * textureSize halfs per texture, iterating the first
/// array dimension in the outer loop and the second in the inner loop).
/// </summary>
/// <param name="allHeightmap">Grid of heightmap textures; every texture must be textureSize x textureSize.</param>
/// <param name="textureSize">Expected width and height of every texture, in pixels.</param>
/// <param name="lodLevel">Number of "LOD&lt;i&gt;" sub-directories to create inside the terrain directory.</param>
/// <param name="path">Parent directory; a trailing '/' is added when missing.</param>
/// <param name="terrainName">Name of the terrain directory created inside <paramref name="path"/>.</param>
/// <exception cref="System.ArgumentException">A texture does not have the expected size.</exception>
private static void PackHeightmap(Texture2D[,] allHeightmap, int textureSize, int lodLevel, string path, string terrainName)
{
    ComputeShader shader = Resources.Load <ComputeShader>("MipmapCompute");
    ComputeBuffer readBuffer = new ComputeBuffer(textureSize * textureSize, sizeof(float));

    // try/finally guarantees the GPU buffer and the shader asset are released on every exit path,
    // including I/O failures while writing.
    try
    {
        int countX = allHeightmap.GetLength(0);
        int countY = allHeightmap.GetLength(1);

        // Validate before touching the file system, so a bad input does not wipe existing output.
        for (int x = 0; x < countX; ++x)
        {
            for (int y = 0; y < countY; ++y)
            {
                Texture2D tex = allHeightmap[x, y];
                if (tex.width != textureSize || tex.height != textureSize)
                {
                    throw new System.ArgumentException("Texture " + tex.name + " setting is not right!(Width, Height, isReadable)");
                }
            }
        }

        MStringBuilder sb = new MStringBuilder(path.Length + terrainName.Length + 15);
        sb.Add(path);
        if (path.Length > 0 && path[path.Length - 1] != '/')
        {
            sb.Add("/");
        }
        sb.Add(terrainName);

        // Start from a clean terrain directory. The delete must be recursive: a previous export
        // leaves content inside it, and the non-recursive overload throws on a non-empty directory.
        if (Directory.Exists(sb.str))
        {
            Directory.Delete(sb.str, true);
        }
        Directory.CreateDirectory(sb.str);

        int pathLength = sb.str.Length;
        for (int lod = 0; lod < lodLevel; ++lod)
        {
            sb.Resize(pathLength);
            sb.Add("/LOD" + lod.ToString());
            Directory.CreateDirectory(sb.str);
        }

        // NOTE(review): the output file path below is "<terrain>/LOD0", which is also the name of a
        // directory created just above whenever lodLevel > 0; opening it as a file will presumably
        // fail in that case. The intended file name cannot be determined from here - confirm.
        sb.Resize(pathLength);
        sb.Add("/LOD0");

        shader.SetBuffer(1, "_OutputBuffer", readBuffer);
        float[] result = new float[textureSize * textureSize];
        // Reused for every texture: exactly sizeof(half) bytes per height sample.
        byte[] packed = new byte[result.Length * sizeof(half)];

        void SaveTexture(Stream output, Texture tex)
        {
            shader.SetTexture(1, ShaderIDs._MainTex, tex);
            int kernelSize = Mathf.CeilToInt(textureSize / 8f);
            shader.Dispatch(1, kernelSize, kernelSize, 1);
            readBuffer.GetData(result);

            fixed (byte* packedPtr = packed)
            {
                half* halfPtr = (half*)packedPtr;
                for (int sample = 0; sample < result.Length; ++sample)
                {
                    halfPtr[sample] = (half)result[sample];
                }
            }

            // Write raw bytes. Going through a text writer would run the data through a character
            // encoder and corrupt the binary payload.
            output.Write(packed, 0, packed.Length);
        }

        using (FileStream stream = new FileStream(sb.str, FileMode.Create, FileAccess.Write))
        {
            for (int x = 0; x < countX; ++x)
            {
                for (int y = 0; y < countY; ++y)
                {
                    SaveTexture(stream, allHeightmap[x, y]);
                }
            }
        }
    }
    finally
    {
        readBuffer.Dispose();
        Resources.UnloadAsset(shader);
    }
}