Exemplo n.º 1
0
            /// <summary>
            /// Accumulates the product of an A block and a B block into an S block:
            /// for every i in [0, blockSize) and every c in [0, 16),
            /// S[i + Sstride * c] += sum over l in [0, blockSize) of A[i + Astride * l] * B[l * Bstride + c].
            /// The 16 values of c are unrolled by hand; sums are carried in float and
            /// converted back to half only when stored.
            /// </summary>
            /// <param name="Ap">Pointer to the first element of the A block.</param>
            /// <param name="Astride">Element distance between consecutive l-slices of A.</param>
            /// <param name="Bp">Pointer to the first element of the B block.</param>
            /// <param name="Bstride">Element distance between consecutive l-slices of B.</param>
            /// <param name="Sp">Pointer to the first element of the S block (read, then overwritten).</param>
            /// <param name="Sstride">Element distance between consecutive c-slices of S.</param>
            /// <remarks>
            /// NOTE(review): the unroll is fixed at 16 while the loops run to blockSize —
            /// presumably blockSize is 16; confirm against its declaration.
            /// </remarks>
            static void MultiplyBlockUnrollHx16(half *Ap, int Astride, half *Bp, int Bstride, half *Sp, int Sstride)
            {
                for (int row = 0; row < blockSize; row++)
                {
                    half *sBase = Sp + row;
                    half *aBase = Ap + row;

                    // Seed the sixteen running sums with the values already stored in S.
                    float acc0  = sBase[Sstride * 0];
                    float acc1  = sBase[Sstride * 1];
                    float acc2  = sBase[Sstride * 2];
                    float acc3  = sBase[Sstride * 3];
                    float acc4  = sBase[Sstride * 4];
                    float acc5  = sBase[Sstride * 5];
                    float acc6  = sBase[Sstride * 6];
                    float acc7  = sBase[Sstride * 7];
                    float acc8  = sBase[Sstride * 8];
                    float acc9  = sBase[Sstride * 9];
                    float acc10 = sBase[Sstride * 10];
                    float acc11 = sBase[Sstride * 11];
                    float acc12 = sBase[Sstride * 12];
                    float acc13 = sBase[Sstride * 13];
                    float acc14 = sBase[Sstride * 14];
                    float acc15 = sBase[Sstride * 15];

                    for (int k = 0; k < blockSize; k++)
                    {
                        // One A value is shared by all sixteen products of this step.
                        float a    = aBase[Astride * k];
                        half *bRow = Bp + k * Bstride;

                        float b0  = bRow[0];
                        float b1  = bRow[1];
                        float b2  = bRow[2];
                        float b3  = bRow[3];
                        float b4  = bRow[4];
                        float b5  = bRow[5];
                        float b6  = bRow[6];
                        float b7  = bRow[7];
                        float b8  = bRow[8];
                        float b9  = bRow[9];
                        float b10 = bRow[10];
                        float b11 = bRow[11];
                        float b12 = bRow[12];
                        float b13 = bRow[13];
                        float b14 = bRow[14];
                        float b15 = bRow[15];

                        acc0  += a * b0;
                        acc1  += a * b1;
                        acc2  += a * b2;
                        acc3  += a * b3;
                        acc4  += a * b4;
                        acc5  += a * b5;
                        acc6  += a * b6;
                        acc7  += a * b7;
                        acc8  += a * b8;
                        acc9  += a * b9;
                        acc10 += a * b10;
                        acc11 += a * b11;
                        acc12 += a * b12;
                        acc13 += a * b13;
                        acc14 += a * b14;
                        acc15 += a * b15;
                    }

                    // Narrow the float sums back to half and store them where they were read from.
                    sBase[Sstride * 0]  = (half)acc0;
                    sBase[Sstride * 1]  = (half)acc1;
                    sBase[Sstride * 2]  = (half)acc2;
                    sBase[Sstride * 3]  = (half)acc3;
                    sBase[Sstride * 4]  = (half)acc4;
                    sBase[Sstride * 5]  = (half)acc5;
                    sBase[Sstride * 6]  = (half)acc6;
                    sBase[Sstride * 7]  = (half)acc7;
                    sBase[Sstride * 8]  = (half)acc8;
                    sBase[Sstride * 9]  = (half)acc9;
                    sBase[Sstride * 10] = (half)acc10;
                    sBase[Sstride * 11] = (half)acc11;
                    sBase[Sstride * 12] = (half)acc12;
                    sBase[Sstride * 13] = (half)acc13;
                    sBase[Sstride * 14] = (half)acc14;
                    sBase[Sstride * 15] = (half)acc15;
                }
            }
Exemplo n.º 2
0
            /// <summary>
            /// Computes output row <paramref name="y"/> for every batch item and every output column.
            /// For each output position the kernel window is walked; at every in-bounds tap the
            /// per-channel product input[k] * kernel[k] is added to accumulator k, and finally
            /// accumulator[k] + bias[k] is written to the output.
            /// </summary>
            /// <param name="y">Index of the output row handled by this invocation.</param>
            /// <remarks>
            /// Kernel taps that fall outside the input (after applying stride and padding) are skipped,
            /// which is equivalent to zero padding. Accumulation is stored in half precision.
            /// </remarks>
            public void Execute(int y)
            {
                // Scratch buffer holding one accumulator per kernel/channel; released at the end of the call.
                int   accumulatorMemSize = data.kernelCount * sizeof(half);
                half *outputAccumulators = (half *)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);

                for (int n = 0; n < data.outBatch; ++n)
                {
                    for (int x = 0; x < data.outWidth; ++x)
                    {
                        // reset accumulators to 0
                        UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);

                        // gather X * K results in accumulators
                        for (int dy = 0; dy < data.kernelHeight; ++dy)
                        {
                            int readY = y * data.strideY + dy - data.padY;
                            if (readY < 0 || readY >= data.inHeight)
                            {
                                continue; // tap lies in the vertical padding
                            }

                            for (int dx = 0; dx < data.kernelWidth; ++dx)
                            {
                                // Horizontal coordinate must use the horizontal padding (was padY,
                                // which mis-sampled the input whenever padX != padY).
                                int readX = x * data.strideX + dx - data.padX;
                                if (readX < 0 || readX >= data.inWidth)
                                {
                                    continue; // tap lies in the horizontal padding
                                }

                                half *dst    = outputAccumulators;
                                half *src    = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                                half *kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;

                                int k = 0;
                                for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                                {
                                    for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
                                    {
                                        *dst += (half)((*src) * (*kernel));
                                    }
                                }
                                for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
                                {
                                    *dst += (half)((*src) * (*kernel));
                                }
                            }
                        }

                        { // write accumulators to memory and add bias
                            int   k    = 0;
                            half *src  = outputAccumulators;
                            half *dst  = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                            half *bias = Bptr;
                            for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                            {
                                for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
                                {
                                    *dst = (half)((*src) + (*bias));
                                }
                            }
                            for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
                            {
                                *dst = (half)((*src) + (*bias));
                            }
                        }
                    }
                }

                UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
            }
Exemplo n.º 3
0
            /// <summary>
            /// Computes one blockSize x blockSize tile of the output for one batch item.
            /// The tile is first filled from the bias vector, then the product of the matching
            /// A and B tiles is accumulated into it for every step of the inner dimension.
            /// Tiles that would reach past the end of a matrix are staged through zero-padded
            /// scratch blocks so the multiply kernel always works on full blocks.
            /// </summary>
            /// <param name="threadID">
            /// Flat work-item index; decomposed into a batch index and a tile position using
            /// data.dispatchThreadX and data.dispatchThreadY.
            /// </param>
            public void Execute(int threadID)
            {
                half *lhs    = this.Xptr;
                half *rhs    = this.Sptr;
                half *bias   = this.Bptr;
                half *output = this.Optr;
                int   AM     = data.AM;
                int   AN     = data.AN;
                int   BM     = data.BM;
                int   BN     = data.BN;
                int   SM     = data.SM;
                int   SN     = data.SN;

                // Split the flat index into (batch, tile i, tile j).
                int threadsPerBatch = data.dispatchThreadX * data.dispatchThreadY;
                int batch           = threadID / threadsPerBatch;
                int threadInBatch   = threadID % threadsPerBatch;
                int i               = threadInBatch % data.dispatchThreadX;
                int j               = threadInBatch / data.dispatchThreadX;

                int batchOffsetA = batch * AM * AN;
                int batchOffsetS = batch * SM * SN;

                int rowA = i * blockSize;
                int colB = j * blockSize;

                unsafe
                {
                    half *scratchA = null;
                    half *scratchB = null;
                    half *scratchS = null;

                    half *tileS   = output + rowA + SM * colB + batchOffsetS;
                    int   strideS = SM;

                    // A partial output tile is accumulated in a scratch block and copied back at the end.
                    bool outputTileIsPartial = rowA + blockSize > SM || colB + blockSize > SN;
                    if (outputTileIsPartial)
                    {
                        scratchS = AllocBlockHalf(blockSize, blockSize);
                        strideS  = blockSize;
                        tileS    = scratchS;
                    }

                    // Seed the tile with the bias; positions past BN are seeded with zero.
                    for (int y = 0; y < blockSize; y++)
                    {
                        int  biasIndex   = colB + y;
                        bool biasInRange = biasIndex < BN;

                        for (int x = 0; x < blockSize; x++)
                        {
                            float seed = biasInRange ? bias[biasIndex] : 0.0f;
                            tileS[x + strideS * y] = (half)seed;
                        }
                    }

                    for (int l = 0; l < AN; l += blockSize) // inner-loop
                    {
                        half *tileA   = lhs + rowA + AM * l + batchOffsetA;
                        half *tileB   = rhs + l * BN + colB;
                        int   strideA = AM;
                        int   strideB = BN;

                        // Stage a partial A tile in a zero-padded scratch block.
                        bool tileAIsPartial = rowA + blockSize > AM || l + blockSize > AN;
                        if (tileAIsPartial)
                        {
                            if (scratchA == null)
                            {
                                scratchA = AllocBlockHalf(blockSize, blockSize);
                            }
                            strideA = blockSize;

                            for (int y = 0; y < blockSize; y++)
                            {
                                for (int x = 0; x < blockSize; x++)
                                {
                                    bool  inside = (rowA + x) < AM && (l + y < AN);
                                    float value  = inside ? tileA[x + AM * y] : 0.0f;
                                    scratchA[x + blockSize * y] = (half)value;
                                }
                            }

                            tileA = scratchA;
                        }

                        // Stage a partial B tile in a zero-padded scratch block.
                        bool tileBIsPartial = colB + blockSize > BN || l + blockSize > BM;
                        if (tileBIsPartial)
                        {
                            if (scratchB == null)
                            {
                                scratchB = AllocBlockHalf(blockSize, blockSize);
                            }
                            strideB = blockSize;

                            for (int y = 0; y < blockSize; y++)
                            {
                                for (int x = 0; x < blockSize; x++)
                                {
                                    bool  inside = (colB + x) < BN && (l + y < BM);
                                    float value  = inside ? tileB[x + BN * y] : 0.0f;
                                    scratchB[x + blockSize * y] = (half)value;
                                }
                            }

                            tileB = scratchB;
                        }

                        MultiplyBlockUnrollHx16(tileA, strideA, tileB, strideB, tileS, strideS);
                    }

                    // When the result lives in the scratch block, copy the in-range part to the output.
                    if (tileS == scratchS)
                    {
                        for (int y = 0; y < blockSize; y++)
                        {
                            for (int x = 0; x < blockSize; x++)
                            {
                                bool insideOutput = ((rowA + x) < SM) && ((colB + y) < SN);
                                if (!insideOutput)
                                {
                                    continue;
                                }

                                output[(rowA + x) + SM * (colB + y) + batchOffsetS] = scratchS[x + blockSize * y];
                            }
                        }
                    }

                    FreeBlock(scratchA);
                    FreeBlock(scratchB);
                    FreeBlock(scratchS);
                }
            }
        /// <summary>
        /// Recreates the terrain output directory (with one "LOD{i}" sub-directory per LOD level),
        /// validates that every heightmap is <paramref name="textureSize"/> x <paramref name="textureSize"/>,
        /// then reads each heightmap back through kernel 1 of the "MipmapCompute" compute shader,
        /// converts the values to half and writes them to the output stream.
        /// </summary>
        /// <param name="allHeightmap">Grid of heightmap textures; written in [x, y] iteration order.</param>
        /// <param name="textureSize">Required width and height of every heightmap, in pixels.</param>
        /// <param name="lodLevel">Number of "LOD{i}" sub-directories to create.</param>
        /// <param name="path">Parent directory; a trailing '/' is appended when missing.</param>
        /// <param name="terrainName">Name of the terrain directory created under <paramref name="path"/>.</param>
        /// <exception cref="System.Exception">A heightmap does not have the expected dimensions.</exception>
        private static void PackHeightmap(Texture2D[,] allHeightmap, int textureSize, int lodLevel, string path, string terrainName)
        {
            ComputeShader shader     = Resources.Load <ComputeShader>("MipmapCompute");
            ComputeBuffer readBuffer = new ComputeBuffer(textureSize * textureSize, sizeof(float));

            // try/finally guarantees the GPU buffer and the shader asset are released on every
            // exit path, including exceptions from file I/O or validation.
            try
            {
                MStringBuilder sb = new MStringBuilder(path.Length + terrainName.Length + 15);

                sb.Add(path);
                if (path[path.Length - 1] != '/')
                {
                    sb.Add("/");
                }
                sb.Add(terrainName);

                if (Directory.Exists(sb.str))
                {
                    // Recursive delete: a previous run leaves LOD sub-directories behind, and the
                    // single-argument overload throws on a non-empty directory.
                    Directory.Delete(sb.str, true);
                }
                Directory.CreateDirectory(sb.str);
                int pathLength = sb.str.Length;

                for (int i = 0; i < lodLevel; ++i)
                {
                    sb.Resize(pathLength);
                    sb.Add("/LOD" + i.ToString());
                    Directory.CreateDirectory(sb.str);
                }
                sb.Resize(pathLength);
                sb.Add("/LOD0");

                // Validate every texture before any GPU work or file output happens.
                for (int x = 0; x < allHeightmap.GetLength(0); ++x)
                {
                    for (int y = 0; y < allHeightmap.GetLength(1); ++y)
                    {
                        Texture2D tex = allHeightmap[x, y];
                        if (tex.width != textureSize ||
                            tex.height != textureSize)
                        {
                            throw new System.Exception("Texture " + tex.name + " setting is not right!(Width, Height, isReadable)");
                        }
                    }
                }

                shader.SetBuffer(1, "_OutputBuffer", readBuffer);
                float[] result     = new float[textureSize * textureSize];
                int     kernelSize = Mathf.CeilToInt(textureSize / 8f); // thread groups per axis; presumably 8x8 threads per group — confirm in the shader

                // Reads one texture back from the GPU and appends it to the writer as half values.
                void SaveTexture(StreamWriter writer, Texture tex)
                {
                    shader.SetTexture(1, ShaderIDs._MainTex, tex);
                    shader.Dispatch(1, kernelSize, kernelSize, 1);
                    readBuffer.GetData(result);

                    // NOTE(review): the buffer holds result.Length * sizeof(half) chars although only
                    // the first result.Length chars are filled, and the raw half bits are emitted as
                    // text through the writer's encoding. Left unchanged because the reader of this
                    // file is not visible here — confirm the intended on-disk format.
                    char[] chrArray = new char[result.Length * sizeof(half)];
                    half * arrPtr   = (half *)chrArray.Ptr();

                    for (int i = 0; i < result.Length; ++i)
                    {
                        arrPtr[i] = (half)result[i];
                    }
                    writer.Write(chrArray);
                }

                // NOTE(review): sb.str ends in "/LOD0", which the loop above creates as a directory
                // whenever lodLevel > 0 — presumably a file name is missing here; confirm the
                // intended output path.
                using (StreamWriter writer = new StreamWriter(sb.str))
                {
                    for (int x = 0; x < allHeightmap.GetLength(0); ++x)
                    {
                        for (int y = 0; y < allHeightmap.GetLength(1); ++y)
                        {
                            SaveTexture(writer, allHeightmap[x, y]);
                        }
                    }
                }
            }
            finally
            {
                readBuffer.Dispose();
                if (shader != null)
                {
                    Resources.UnloadAsset(shader);
                }
            }
        }