예제 #1
0
    /// <summary>
    /// Copies every element of <paramref name="src"/> that is equal to <paramref name="target"/>
    /// into <paramref name="dst"/>, keeping the original order, and reports how many were copied.
    /// </summary>
    /// <param name="src">Pointer to the first input element.</param>
    /// <param name="srcCount">Number of elements readable from <paramref name="src"/>.</param>
    /// <param name="target">Value an element must equal to be kept.</param>
    /// <param name="dst">
    /// Pointer to the output buffer. The vector path always stores a whole 8-element vector at the
    /// current write position, even when fewer lanes matched, so the buffer needs room for
    /// <paramref name="srcCount"/> elements rather than just the number of matches.
    /// </param>
    /// <param name="dstCount">Receives the number of elements written to <paramref name="dst"/>.</param>
    public static unsafe void V256Filter(int *src, int srcCount, int target, int *dst, out int dstCount)
    {
        // Largest multiple of 8 that does not exceed srcCount; this prefix is processed 8 ints at a time.
        int vectorEnd    = srcCount & ~7;
        int* writeCursor = dst;
        int index        = 0;

        while (index < vectorEnd)
        {
            var lanes     = mm256_loadu_si256(src + index);
            var broadcast = mm256_set1_epi32(target);
            var hits      = mm256_cmpeq_epi32(lanes, broadcast);

            // NOTE(review): LeftPack8PS is defined elsewhere; presumably it moves the lanes selected
            // by the comparison mask to the front of the vector - confirm against SIMDHelpers.
            var compacted = SIMDHelpers.LeftPack8PS(hits, lanes);

            // Full-width store; only the first popcount(mask) lanes are kept because the cursor
            // advances by that amount and the next store overwrites the rest.
            mm256_storeu_si256(writeCursor, compacted);
            writeCursor += popcnt_u32((uint)mm256_movemask_ps(hits));

            index += 8;
        }

        // Scalar pass over the remaining (fewer than 8) elements.
        while (index < srcCount)
        {
            int candidate = src[index];
            if (candidate == target)
            {
                *writeCursor = candidate;
                writeCursor++;
            }

            index++;
        }

        dstCount = (int)(writeCursor - dst);
    }
예제 #2
0
    /// <summary>
    /// Copies every element of <paramref name="src"/> that lies strictly between
    /// <paramref name="greaterThan"/> and <paramref name="lessThan"/> into <paramref name="dst"/>,
    /// keeping the original order, and reports how many were copied.
    /// </summary>
    /// <param name="src">Pointer to the first input element.</param>
    /// <param name="srcCount">Number of elements readable from <paramref name="src"/>.</param>
    /// <param name="greaterThan">Exclusive lower bound: kept elements are greater than this value.</param>
    /// <param name="lessThan">Exclusive upper bound: kept elements are less than this value.</param>
    /// <param name="dst">
    /// Pointer to the output buffer. The vector path always stores a whole 8-element vector at the
    /// current write position, even when fewer lanes matched, so the buffer needs room for
    /// <paramref name="srcCount"/> elements rather than just the number of matches.
    /// </param>
    /// <param name="dstCount">Receives the number of elements written to <paramref name="dst"/>.</param>
    private static unsafe void V256Filter([NoAlias] int *src, int srcCount, int greaterThan, int lessThan, [NoAlias] int *dst, [NoAlias] out int dstCount)
    {
        // Largest multiple of 8 that does not exceed srcCount; this prefix is processed 8 ints at a time.
        int vectorEnd    = srcCount & ~7;
        int* writeCursor = dst;
        int index        = 0;

        while (index < vectorEnd)
        {
            var lanes      = mm256_loadu_si256(src + index);
            var upperBound = mm256_set1_epi32(lessThan);
            var belowUpper = mm256_cmpgt_epi32(upperBound, lanes);
            var lowerBound = mm256_set1_epi32(greaterThan);
            var aboveLower = mm256_cmpgt_epi32(lanes, lowerBound);

            // A lane is kept only when both bound checks passed.
            var hits = mm256_and_ps(belowUpper, aboveLower);

            // NOTE(review): LeftPack8PS is defined elsewhere; presumably it moves the lanes selected
            // by the comparison mask to the front of the vector - confirm against SIMDHelpers.
            var compacted = SIMDHelpers.LeftPack8PS(hits, lanes);

            // Full-width store; only the first popcount(mask) lanes are kept because the cursor
            // advances by that amount and the next store overwrites the rest.
            mm256_storeu_si256(writeCursor, compacted);
            writeCursor += popcnt_u32((uint)mm256_movemask_ps(hits));

            index += 8;
        }

        // Scalar pass over the remaining (fewer than 8) elements, using the same exclusive bounds.
        while (index < srcCount)
        {
            int candidate = src[index];
            if (candidate < lessThan && candidate > greaterThan)
            {
                *writeCursor = candidate;
                writeCursor++;
            }

            index++;
        }

        dstCount = (int)(writeCursor - dst);
    }