Example #1
0
    /// <summary>
    /// Converts packed 24-bit RGB pixels to 32-bit RGBA pixels with alpha set to 0xFF.
    /// Four pixels are converted per iteration with 128-bit SIMD; the remaining pixels
    /// are converted one at a time.
    /// </summary>
    /// <param name="src">Packed RGB input, three bytes per pixel.</param>
    /// <param name="dst">Output, one 32-bit value per pixel (R in the lowest byte, alpha in the highest).</param>
    /// <param name="count">Number of pixels to convert.</param>
    private static unsafe void V128ForLoop([NoAlias] u8 *src, [NoAlias] u32 *dst, int count)
    {
        // Input:
        //
        // 0123 4567 8901 2345
        // RGBR GBRG BRGB RGBR
        // Output:
        // RGBA RGBA RGBA RGBA

        // Every iteration loads 16 source bytes although only the first 12 (four pixels) are used.
        // The load stays inside the count * 3 input bytes only while at least six pixels remain
        // (5 * 3 = 15 < 16 <= 6 * 3), so the vector loop stops early and the scalar loop below
        // finishes the last few pixels without reading past the end of src.
        const int pixelsPerIteration = 4;
        const int pixelsSpannedByLoad = 6;

        // Moves each RGB triple into the low three bytes of a 32-bit lane; -1 zeroes the fourth byte.
        var shuffle = setr_epi8(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1);
        // 0xFF in the top byte of every 32-bit lane.
        var alpha   = set1_epi32(0xFF << 24);

        int i = 0;

        for (; i <= count - pixelsSpannedByLoad; i += pixelsPerIteration)
        {
            var v0 = loadu_ps(src + i * 3);
            var v1 = shuffle_epi8(v0, shuffle);
            var v2 = or_ps(v1, alpha);
            storeu_ps((dst + i), v2);
        }

        // Scalar tail: produces the same packing as the vector path.
        for (; i < count; i++)
        {
            dst[i] = (u32)src[i * 3 + 0] << 0 |
                     (u32)src[i * 3 + 1] << 8 |
                     (u32)src[i * 3 + 2] << 16 |
                     (u32)0xFF << 24;
        }
    }
 /// <summary>
 /// Scalar conversion of packed 24-bit RGB pixels to float4 colours: each channel is divided
 /// by 255 and the fourth component is set to 1.
 /// </summary>
 /// <param name="src">Packed RGB input, three bytes per pixel.</param>
 /// <param name="dst">Output, one float4 per pixel.</param>
 /// <param name="count">Number of pixels to convert.</param>
 private static unsafe void BurstForLoop([NoAlias] u8 *src, [NoAlias] float4 *dst, int count)
 {
     const float channelMax = 255.0f;

     int pixel = 0;
     while (pixel < count)
     {
         // Byte offset of this pixel's red channel inside the packed input.
         int offset = pixel * 3;

         float red   = src[offset + 0] / channelMax;
         float green = src[offset + 1] / channelMax;
         float blue  = src[offset + 2] / channelMax;

         dst[pixel] = new float4(red, green, blue, 1.0f);
         pixel++;
     }
 }
Example #3
0
 /// <summary>
 /// Scalar conversion of packed 24-bit RGB pixels to 32-bit RGBA pixels with alpha set to 0xFF.
 /// </summary>
 /// <param name="src">Packed RGB input, three bytes per pixel.</param>
 /// <param name="dst">Output, one 32-bit value per pixel (R in the lowest byte, alpha in the highest).</param>
 /// <param name="count">Number of pixels to convert.</param>
 private static unsafe void BurstForLoop([NoAlias] u8 *src, [NoAlias] u32 *dst, int count)
 {
     // Fully opaque alpha, already shifted into the top byte.
     var opaque = (u32)0xFF << 24;

     int pixel = 0;
     while (pixel < count)
     {
         // Byte offset of this pixel's red channel inside the packed input.
         int offset = pixel * 3;

         var red   = (u32)src[offset + 0];
         var green = (u32)src[offset + 1];
         var blue  = (u32)src[offset + 2];

         dst[pixel] = red | (green << 8) | (blue << 16) | opaque;
         pixel++;
     }
 }
    /// <summary>
    /// Converts packed 24-bit RGB pixels to float4 colours (each channel divided by 255, fourth
    /// component 1.0). Five pixels are converted per iteration from a single 16-byte load; the
    /// remaining pixels are converted one at a time.
    /// </summary>
    /// <param name="src">Packed RGB input, three bytes per pixel.</param>
    /// <param name="dst">Output, one float4 per pixel.</param>
    /// <param name="count">Number of pixels to convert.</param>
    private static unsafe void V128ForLoop([NoAlias] u8 *src, [NoAlias] float4 *dst, int count)
    {
        // Five pixels occupy 15 bytes, but each iteration loads 16. The load stays inside the
        // count * 3 input bytes only while at least six pixels remain (5 * 3 = 15 < 16 <= 6 * 3),
        // so the vector loop stops early and the scalar loop below finishes the last few pixels
        // without reading past the end of src.
        const int pixelsPerIteration  = 5;
        const int pixelsSpannedByLoad = 6;

        // 0xFF in the top byte of every 32-bit lane; OR-ing it in turns byte 3 into the alpha value.
        var alpha = set1_epi32(0xFF << 24);
        var _255f = set1_ps(255.0f);
        int i     = 0;

        for (; i <= count - pixelsSpannedByLoad; i += pixelsPerIteration)
        {
            var all = loadu_ps(src + i * 3);

            // Shift right by 3 bytes per pixel so each register starts with its own RGB triple.
            var v0 = srli_si128(all, 0);
            var v1 = srli_si128(all, 3);
            var v2 = srli_si128(all, 6);
            var v3 = srli_si128(all, 9);
            var v4 = srli_si128(all, 12);

            // Force the byte after each triple to 0xFF (alpha).
            v0 = or_ps(v0, alpha);
            v1 = or_ps(v1, alpha);
            v2 = or_ps(v2, alpha);
            v3 = or_ps(v3, alpha);
            v4 = or_ps(v4, alpha);

            // Widen the low four bytes (R, G, B, 0xFF) to four 32-bit integers.
            v0 = cvtepu8_epi32(v0);
            v1 = cvtepu8_epi32(v1);
            v2 = cvtepu8_epi32(v2);
            v3 = cvtepu8_epi32(v3);
            v4 = cvtepu8_epi32(v4);

            v0 = cvtepi32_ps(v0);
            v1 = cvtepi32_ps(v1);
            v2 = cvtepi32_ps(v2);
            v3 = cvtepi32_ps(v3);
            v4 = cvtepi32_ps(v4);

            // Same division by 255 that the scalar tail performs.
            v0 = div_ps(v0, _255f);
            v1 = div_ps(v1, _255f);
            v2 = div_ps(v2, _255f);
            v3 = div_ps(v3, _255f);
            v4 = div_ps(v4, _255f);

            storeu_ps(dst + i + 0, v0);
            storeu_ps(dst + i + 1, v1);
            storeu_ps(dst + i + 2, v2);
            storeu_ps(dst + i + 3, v3);
            storeu_ps(dst + i + 4, v4);
        }

        // Scalar tail.
        for (; i < count; i++)
        {
            dst[i] = new float4(src[i * 3 + 0] / 255.0f, src[i * 3 + 1] / 255.0f, src[i * 3 + 2] / 255.0f, 1.0f);
        }
    }
Example #5
0
    /// <summary>
    /// Converts packed 24-bit RGB pixels to 32-bit RGBA pixels with alpha set to 0xFF.
    /// Eight pixels are converted per iteration with 256-bit SIMD; the remaining pixels
    /// are converted one at a time.
    /// </summary>
    /// <param name="src">Packed RGB input, three bytes per pixel.</param>
    /// <param name="dst">Output, one 32-bit value per pixel (R in the lowest byte, alpha in the highest).</param>
    /// <param name="count">Number of pixels to convert.</param>
    private static unsafe void V256ForLoop([NoAlias] u8 *src, [NoAlias] u32 *dst, int count)
    {
        // Input
        // u128     0                   1
        // u64      0         1         2         3
        // u32      0    1    2    3    4    5    6    7
        // u16      0 1  2 3  4 5  6 7  8 9  0 1  2 3  4 5
        // u8       0123 4567 8901 2345 6789 0123 4567 8901
        //          RGBR GBRG BRGB RGBR GBRG BRGB ---- ----
        // Output
        //          RGBA RGBA RGBA RGBA RGBA RGBA RGBA RGBA
        // Path
        // v0 =     0123 4567 89AB CDEF GHIJ KLMN ---- ----
        // v1 =     0123 4567 89AB ---- CDEF GHIJ KLMN ----
        // v2 =     012- 345- 678- 9AB- CDE- FGH- IJK- LMN-
        // v3 =     012α 345α 678α 9ABα CDEα FGHα IJKα LMNα

        // Every iteration loads 32 source bytes although only the first 24 (eight pixels) are used.
        // The load stays inside the count * 3 input bytes only while at least eleven pixels remain
        // (10 * 3 = 30 < 32 <= 11 * 3), so the vector loop stops early and the scalar loop below
        // finishes the last few pixels without reading past the end of src.
        const int pixelsPerIteration  = 8;
        const int pixelsSpannedByLoad = 11;

        // Puts 32-bit words 0..2 in the low 128-bit half and words 3..5 in the high half.
        // The fourth word of each half is discarded by the byte shuffle, so its index is irrelevant.
        var permute      = mm256_setr_epi32(0, 1, 2, 0xFF, 3, 4, 5, 0xFF);
        // Per 128-bit half: move each RGB triple into the low three bytes of a 32-bit lane.
        var shuffleV128  = setr_epi8(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1);
        var shuffleV256  = mm256_setr_m128(shuffleV128, shuffleV128);
        // 0xFF in the top byte of every 32-bit lane.
        var alpha        = mm256_set1_epi32(0xFF << 24);
        int i            = 0;

        for (; i <= count - pixelsSpannedByLoad; i += pixelsPerIteration)
        {
            var v0 = mm256_loadu_ps(src + i * 3);
            var v1 = mm256_permutevar8x32_epi32(v0, permute);
            var v2 = mm256_shuffle_epi8(v1, shuffleV256);
            var v3 = mm256_or_ps(v2, alpha);
            mm256_storeu_ps(dst + i, v3);
        }

        // Scalar tail: produces the same packing as the vector path.
        for (; i < count; i++)
        {
            dst[i] = (u32)src[i * 3 + 0] << 0 |
                     (u32)src[i * 3 + 1] << 8 |
                     (u32)src[i * 3 + 2] << 16 |
                     (u32)0xFF << 24;
        }
    }
Example #6
0
/*
 * Read a page from the write-ahead log, if it is present.
 *
 * NOTE(review): this C-style prototype looks unrelated to the surrounding RGB
 * conversion routines - confirm that it belongs in this file.
 * The roles of the parameters are not visible from here; presumably pWal is
 * the log handle, pgno the page number, pInWal an output flag and nOut/pOut
 * the size and address of the output buffer - verify against the declaring
 * header before relying on this.
 */
        int sqlite3WalRead(Wal *pWal, Pgno pgno, int *pInWal, int nOut, u8 *pOut);
    /// <summary>
    /// Converts packed 24-bit RGB pixels to float4 colours (each channel divided by 255, fourth
    /// component 1.0). Ten pixels are converted per iteration from a single 32-byte load; the
    /// remaining pixels are converted one at a time.
    /// </summary>
    /// <param name="src">Packed RGB input, three bytes per pixel.</param>
    /// <param name="dst">Output, one float4 per pixel.</param>
    /// <param name="count">Number of pixels to convert.</param>
    private static unsafe void V256ForLoop([NoAlias] u8 *src, [NoAlias] float4 *dst, int count)
    {
        // Input:
        //  u128    0                   1
        //  u64     0         1         2         3
        //  u32     0    1    2    3    4    5    6    7
        //  u16     0 1  2 3  4 5  6 7  8 9  0 1  2 3  4 5
        //  u8      0123 4567 8901 2345 6789 0123 4567 8901
        //          RGBR GBRG BRGB RGBR GBRG BRGB RGBR GBRG
        //          0  1   2   3   4  5   6   7   8  9
        // Registers:
        //          0123 4567 89AB CDEF GHIJ KLMN OPQR ST--
        // v0       0123 45-- ---- ---- ---- ---- ---- ---- << 0, lo
        // v1       6789 AB-- ---- ---- ---- ---- ---- ---- << 6, lo
        // v2       CDEF GH-- ---- ---- ---- ---- ---- ---- permute_epi32(3, 4, -1, -1, -1)
        // v3       IJKL MN-- ---- ---- ---- ---- ---- ---- << 2, hi
        // v4       OPQR ST-- ---- ---- ---- ---- ---- ---- << 8, hi
        //
        // Path each register takes after isolating the 6 bytes (two pixels) it works on.
        //
        // α = 255
        //          0123  45--  ----  ----  ----  ----  ----  ----
        //          0120  3450  ----  ----  ----  ----  ----  ----
        //          012α  345α  ----  ----  ----  ----  ----  ----
        //          0     1     2     α     3     4     5     α
        //          0f    1f    2f    αf    3f    4f    5f    αf
        //          0f/αf 1f/αf 2f/αf αf/αf 3f/αf 4f/αf 5f/αf αf/αf
        //

        // Ten pixels occupy 30 bytes, but each iteration loads 32. The load stays inside the
        // count * 3 input bytes only while at least eleven pixels remain (10 * 3 = 30 < 32 <= 11 * 3),
        // so the vector loop stops early and the scalar loop below finishes the last few pixels
        // without reading past the end of src.
        const int pixelsPerIteration  = 10;
        const int pixelsSpannedByLoad = 11;

        // Per 128-bit half: spread two RGB triples over two 32-bit lanes, zeroing everything else.
        var shuffleV128 = setr_epi8(0, 1, 2, -1, 3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1);
        var shuffleV256 = mm256_setr_m128(shuffleV128, shuffleV128);

        // Pixels 4 and 5 straddle the two 128-bit halves (bytes 12..17), so 32-bit words 3 and 4
        // are gathered into the low half. The remaining indices select words that the shuffle drops.
        var straddle = mm256_setr_epi32(3, 4, -1, -1, -1, -1, -1, -1);

        // 0xFF in the top byte of every 32-bit lane.
        var alpha = mm256_set1_epi32(0xFF << 24);
        // Divide by 255 exactly like the scalar tail does. An approximate reciprocal
        // (mm256_rcp_ps) would give slightly different channel values and an alpha that is
        // not exactly 1.0.
        var _255f = mm256_set1_ps(255.0f);
        int i     = 0;

        for (; i <= count - pixelsSpannedByLoad; i += pixelsPerIteration)
        {
            var all = mm256_loadu_ps(src + i * 3);

            // The byte shifts operate on each 128-bit half independently.
            var v0  = mm256_srli_si256(all, 0);
            var v1  = mm256_srli_si256(all, 6);
            var v2  = mm256_permutevar8x32_epi32(all, straddle);
            var v3  = mm256_srli_si256(all, 2);
            var v4  = mm256_srli_si256(all, 8);

            v0 = mm256_shuffle_epi8(v0, shuffleV256);
            v1 = mm256_shuffle_epi8(v1, shuffleV256);
            v2 = mm256_shuffle_epi8(v2, shuffleV256);
            v3 = mm256_shuffle_epi8(v3, shuffleV256);
            v4 = mm256_shuffle_epi8(v4, shuffleV256);

            v0 = mm256_or_ps(v0, alpha);
            v1 = mm256_or_ps(v1, alpha);
            v2 = mm256_or_ps(v2, alpha);
            v3 = mm256_or_ps(v3, alpha);
            v4 = mm256_or_ps(v4, alpha);

            // Widen eight bytes (R, G, B, 0xFF, R, G, B, 0xFF) to eight 32-bit integers.
            // v3 and v4 hold their pixels in the high half.
            v0 = mm256_cvtepu8_epi32(v0.Lo128);
            v1 = mm256_cvtepu8_epi32(v1.Lo128);
            v2 = mm256_cvtepu8_epi32(v2.Lo128);
            v3 = mm256_cvtepu8_epi32(v3.Hi128);
            v4 = mm256_cvtepu8_epi32(v4.Hi128);

            v0 = mm256_cvtepi32_ps(v0);
            v1 = mm256_cvtepi32_ps(v1);
            v2 = mm256_cvtepi32_ps(v2);
            v3 = mm256_cvtepi32_ps(v3);
            v4 = mm256_cvtepi32_ps(v4);

            v0 = mm256_div_ps(v0, _255f);
            v1 = mm256_div_ps(v1, _255f);
            v2 = mm256_div_ps(v2, _255f);
            v3 = mm256_div_ps(v3, _255f);
            v4 = mm256_div_ps(v4, _255f);

            // Each 256-bit store writes two float4 values.
            mm256_storeu_ps(dst + i + 0, v0);
            mm256_storeu_ps(dst + i + 2, v1);
            mm256_storeu_ps(dst + i + 4, v2);
            mm256_storeu_ps(dst + i + 6, v3);
            mm256_storeu_ps(dst + i + 8, v4);
        }

        // Scalar tail.
        for (; i < count; i++)
        {
            dst[i] = new float4(src[i * 3 + 0] / 255.0f, src[i * 3 + 1] / 255.0f, src[i * 3 + 2] / 255.0f, 1.0f);
        }
    }