/// <summary>
/// Converts the instance field <c>_fld</c> with <c>Avx.ConvertToVector256Int32</c>,
/// writes the converted vector to the data table's output buffer, and validates
/// the output against the field.
/// </summary>
public void RunFldScenario()
{
    Vector256<int> converted = Avx.ConvertToVector256Int32(_fld);
    Unsafe.Write(_dataTable.outArrayPtr, converted);

    ValidateResult(_fld, _dataTable.outArrayPtr);
}
/// <summary>
/// Loads eight consecutive floats starting at <c>origin[index]</c> and converts them
/// to a vector of eight 32-bit integers.
/// </summary>
/// <param name="origin">Base pointer of the float data; at least <c>index + 8</c> elements must be readable.</param>
/// <param name="index">Element offset (in floats) of the first value to load.</param>
/// <returns>
/// The converted vector when AVX is supported; otherwise <c>default</c> (an all-zero vector).
/// </returns>
public static unsafe Vector256<int> ConvertToVector256Int32(float* origin, uint index)
{
    if (Avx.IsSupported)
    {
        return Avx.ConvertToVector256Int32(Avx.LoadVector256(&origin[index]));
    }

    // No AVX available: callers get the zero vector rather than an exception.
    return default;
}
/// <summary>
/// Creates a new <c>SimpleUnaryOpTest__ConvertToVector256Int32Single</c> instance,
/// converts that instance's <c>_fld</c> with <c>Avx.ConvertToVector256Int32</c>,
/// writes the result to this object's output buffer, and validates it.
/// </summary>
public void RunLclFldScenario()
{
    var localTest = new SimpleUnaryOpTest__ConvertToVector256Int32Single();

    Vector256<int> converted = Avx.ConvertToVector256Int32(localTest._fld);
    Unsafe.Write(_dataTable.outArrayPtr, converted);

    ValidateResult(localTest._fld, _dataTable.outArrayPtr);
}
/// <summary>
/// Loads the input vector into a local with an aligned load from the data table's
/// input buffer, converts it with <c>Avx.ConvertToVector256Int32</c>, writes the
/// result to the output buffer, and validates it against the loaded local.
/// </summary>
public void RunLclVarScenario_LoadAligned()
{
    Vector256<float> input = Avx.LoadAlignedVector256((float*)(_dataTable.inArrayPtr));

    Vector256<int> converted = Avx.ConvertToVector256Int32(input);
    Unsafe.Write(_dataTable.outArrayPtr, converted);

    ValidateResult(input, _dataTable.outArrayPtr);
}
/// <summary>
/// Reads the input vector into a local via <c>Unsafe.Read</c> from the data table's
/// input buffer, converts it with <c>Avx.ConvertToVector256Int32</c>, writes the
/// result to the output buffer, and validates it against the local.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    Vector256<float> input = Unsafe.Read<Vector256<float>>(_dataTable.inArrayPtr);

    Vector256<int> converted = Avx.ConvertToVector256Int32(input);
    Unsafe.Write(_dataTable.outArrayPtr, converted);

    ValidateResult(input, _dataTable.outArrayPtr);
}
/// <summary>
/// Converts a vector obtained by an aligned load from the data table's input buffer,
/// passing the load directly as the argument of <c>Avx.ConvertToVector256Int32</c>,
/// then writes the result to the output buffer and validates input against output.
/// </summary>
public void RunBasicScenario_LoadAligned()
{
    // The load stays nested inside the conversion call (no intermediate local).
    Vector256<int> converted = Avx.ConvertToVector256Int32(
        Avx.LoadAlignedVector256((float*)(_dataTable.inArrayPtr)));

    Unsafe.Write(_dataTable.outArrayPtr, converted);

    ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
}
/// <summary>
/// Converts a vector obtained via <c>Unsafe.Read</c> from the data table's input buffer,
/// passing the read directly as the argument of <c>Avx.ConvertToVector256Int32</c>,
/// then writes the result to the output buffer and validates input against output.
/// </summary>
public void RunBasicScenario_UnsafeRead()
{
    // The read stays nested inside the conversion call (no intermediate local).
    Vector256<int> converted = Avx.ConvertToVector256Int32(
        Unsafe.Read<Vector256<float>>(_dataTable.inArrayPtr));

    Unsafe.Write(_dataTable.outArrayPtr, converted);

    ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
}
// Multiply in float, accumulate in int.
// Known limitation: the per-lane int accumulators overflow for large inputs
// (the original author observed this at roughly one million elements).
/// <summary>
/// Computes the variance of <paramref name="vs"/> as mean(x^2) - mean(x)^2,
/// summing the squares eight bytes at a time with AVX/AVX2.
/// </summary>
/// <param name="vs">The byte samples. An empty array yields NaN (0 / 0 in double arithmetic).</param>
/// <returns>The variance of the samples.</returns>
private unsafe double Test7Variance(byte[] vs)
{
    int simdLength = Vector256<int>.Count;

    // Only whole groups of simdLength bytes go through the SIMD loop: the byte* overload of
    // Avx2.ConvertToVector256Int32 always reads simdLength bytes, so iterating up to vs.Length
    // would read past the end of the array whenever the length is not a multiple of simdLength.
    int vectorizedLength = vs.Length - (vs.Length % simdLength);

    int i = 0;
    var vTotal = Vector256<int>.Zero;
    fixed (byte* p = vs)
    {
        for (; i < vectorizedLength; i += simdLength)
        {
            Vector256<int> widened = Avx2.ConvertToVector256Int32(p + i); // bytes 0..7 -> eight ints
            Vector256<float> asFloat = Avx.ConvertToVector256Single(widened);
            Vector256<float> squared = Avx.Multiply(asFloat, asFloat);
            vTotal = Avx2.Add(vTotal, Avx.ConvertToVector256Int32(squared));
        }
    }

    // Horizontal sum of the eight lane accumulators.
    long total = 0;
    int* temp = stackalloc int[simdLength];
    Avx.Store(temp, vTotal);
    for (int j = 0; j < simdLength; j++)
    {
        total += temp[j];
    }

    // Scalar tail for the remaining (fewer than simdLength) elements; these must be squared too.
    for (; i < vs.Length; i++)
    {
        int value = vs[i];
        total += value * value;
    }

    // NOTE(review): Test2 is assumed to return the sum of all elements — confirm against its definition.
    double average = (double)Test2(vs) / vs.Length;
    return ((double)total / vs.Length) - (average * average);
}
/// <summary>
/// Forwards <paramref name="a"/> to <c>Avx.ConvertToVector256Int32</c> and returns the result.
/// </summary>
/// <remarks>
/// NOTE(review): <c>f32</c> and <c>i32</c> are presumably aliases for 256-bit float / int vectors
/// declared elsewhere — confirm against the file's using-alias directives.
/// </remarks>
public static i32 Convertf32_i32(f32 a) => Avx.ConvertToVector256Int32(a);
// Converts one line of float pixels, four floats per pixel, to bytes, three bytes per pixel:
// for each group of four input floats the first three are scaled by byte.MaxValue, converted
// to bytes and written; the fourth float of every group is dropped.
//   ipstart - start of the float input line
//   opstart - start of the byte output line
//   cb      - length of the input line in bytes
//
// Paths (selected at compile time by HWINTRINSICS / VECTOR_CONVERT and at run time by CPU support):
//   * AVX2:  taken only when at least 32 floats are available. Each iteration scales 4 x 8 floats,
//            converts to int32, packs with saturation to int16 and then to bytes, and uses
//            permute/shuffle masks from HWIntrinsics to compact the data to 24 output bytes.
//            Non-final iterations do a full 32-byte store but advance op by only 24; the final
//            vector iteration (LastBlock) stores exactly 16 + 8 = 24 bytes instead
//            (presumably so the store cannot run past the end of the output — confirm).
//   * SSSE3: same scheme on 16 floats -> 12 output bytes per iteration; the final iteration
//            stores exactly 8 + 4 = 12 bytes.
//   * Fallback (VectorF): value * 255 + 0.5, clamped to [0, 255]; elements 0..2 are written and,
//            when VectorF.Count == 8, elements 4..6 as well.
//            NOTE(review): VectorF is presumably an alias for Vector<float> — confirm.
// Whatever the selected path leaves unprocessed is handled by the trailing scalar loop, which
// calls FixToByte on three floats and skips the fourth (ip += 4, op += 3).
unsafe void IConversionProcessor.ConvertLine(byte *ipstart, byte *opstart, int cb) { float *ip = (float *)ipstart, ipe = (float *)(ipstart + cb); byte * op = opstart; #if HWINTRINSICS if (Avx2.IsSupported && ipe >= ip + Vector256 <byte> .Count) { var vscale = Vector256.Create((float)byte.MaxValue); var vmaskp = Avx.LoadVector256((int *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.PermuteMaskDeinterleave8x32))); var vmaskq = Avx.LoadVector256((int *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.PermuteMask3xTo3Chan))); var vmasks = Avx2.BroadcastVector128ToVector256((byte *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.ShuffleMask3xTo3Chan))); ipe -= Vector256 <byte> .Count; do { var vf0 = Avx.Multiply(Avx.LoadVector256(ip), vscale); var vf1 = Avx.Multiply(Avx.LoadVector256(ip + Vector256 <float> .Count), vscale); var vf2 = Avx.Multiply(Avx.LoadVector256(ip + Vector256 <float> .Count * 2), vscale); var vf3 = Avx.Multiply(Avx.LoadVector256(ip + Vector256 <float> .Count * 3), vscale); ip += Vector256 <byte> .Count; var vi0 = Avx.ConvertToVector256Int32(vf0); var vi1 = Avx.ConvertToVector256Int32(vf1); var vi2 = Avx.ConvertToVector256Int32(vf2); var vi3 = Avx.ConvertToVector256Int32(vf3); var vs0 = Avx2.PackSignedSaturate(vi0, vi1); var vs1 = Avx2.PackSignedSaturate(vi2, vi3); var vb0 = Avx2.PackUnsignedSaturate(vs0, vs1); vb0 = Avx2.PermuteVar8x32(vb0.AsInt32(), vmaskp).AsByte(); vb0 = Avx2.Shuffle(vb0, vmasks); vb0 = Avx2.PermuteVar8x32(vb0.AsInt32(), vmaskq).AsByte(); if (ip >= ipe) { goto LastBlock; } Avx.Store(op, vb0); op += Vector256 <byte> .Count * 3 / 4; continue; LastBlock: Sse2.Store(op, vb0.GetLower()); Sse2.StoreScalar((long *)(op + Vector128 <byte> .Count), vb0.GetUpper().AsInt64()); op += Vector256 <byte> .Count * 3 / 4; break; } while (true); ipe += Vector256 <byte> .Count; } else if (Ssse3.IsSupported && ipe >= ip + Vector128 <byte> .Count) { var vscale = Vector128.Create((float)byte.MaxValue); var vmasks = 
Sse2.LoadVector128((byte *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.ShuffleMask3xTo3Chan))); ipe -= Vector128 <byte> .Count; do { var vf0 = Sse.Multiply(Sse.LoadVector128(ip), vscale); var vf1 = Sse.Multiply(Sse.LoadVector128(ip + Vector128 <float> .Count), vscale); var vf2 = Sse.Multiply(Sse.LoadVector128(ip + Vector128 <float> .Count * 2), vscale); var vf3 = Sse.Multiply(Sse.LoadVector128(ip + Vector128 <float> .Count * 3), vscale); ip += Vector128 <byte> .Count; var vi0 = Sse2.ConvertToVector128Int32(vf0); var vi1 = Sse2.ConvertToVector128Int32(vf1); var vi2 = Sse2.ConvertToVector128Int32(vf2); var vi3 = Sse2.ConvertToVector128Int32(vf3); var vs0 = Sse2.PackSignedSaturate(vi0, vi1); var vs1 = Sse2.PackSignedSaturate(vi2, vi3); var vb0 = Sse2.PackUnsignedSaturate(vs0, vs1); vb0 = Ssse3.Shuffle(vb0, vmasks); if (ip >= ipe) { goto LastBlock; } Sse2.Store(op, vb0); op += Vector128 <byte> .Count * 3 / 4; continue; LastBlock: var vl0 = vb0.AsInt64(); Sse2.StoreScalar((long *)op, vl0); Sse.StoreScalar((float *)(op + sizeof(long)), Sse2.UnpackHigh(vl0, vl0).AsSingle()); // https://github.com/dotnet/corefx/issues/41816 op += Vector128 <byte> .Count * 3 / 4; break; } while (true); ipe += Vector128 <byte> .Count; } else #endif { var vmin = new VectorF(byte.MinValue); var vmax = new VectorF(byte.MaxValue); var vround = new VectorF(0.5f); ipe -= VectorF.Count; while (ip <= ipe) { var v = Unsafe.ReadUnaligned <VectorF>(ip) * vmax + vround; v = v.Clamp(vmin, vmax); ip += VectorF.Count; #if VECTOR_CONVERT var vi = Vector.ConvertToInt32(v); #else var vi = v; #endif op[0] = (byte)vi[0]; op[1] = (byte)vi[1]; op[2] = (byte)vi[2]; if (VectorF.Count == 8) { op[3] = (byte)vi[4]; op[4] = (byte)vi[5]; op[5] = (byte)vi[6]; } op += VectorF.Count - VectorF.Count / 4; } ipe += VectorF.Count; } while (ip < ipe) { op[0] = FixToByte(ip[0]); op[1] = FixToByte(ip[1]); op[2] = FixToByte(ip[2]); ip += 4; op += 3; } }
// Converts one line of float pixels, four floats per pixel, to bytes, four bytes per pixel,
// dividing the first three values of each pixel by the fourth before scaling to the byte range.
// NOTE(review): the fourth value is treated as alpha (per the *Alpha mask names), which makes
// this an un-premultiply step — confirm against the caller.
//   ipstart - start of the float input line
//   opstart - start of the byte output line
//   cb      - length of the input line in bytes
//
// Vector paths (HWINTRINSICS; AVX2 handles 32 floats per iteration, SSE4.1 handles 16):
//   1. Broadcast each pixel's fourth value across the pixel (Shuffle with ShuffleMaskAlpha)
//      and raise it to at least vmin = 0.5 / 255.
//   2. Multiply the pixel by the reciprocal of that value, then blend the (clamped) fourth value
//      back into its own lane (BlendMaskAlpha).
//   3. Replace the whole pixel with zero where the clamped value equals vmin, i.e. where the
//      original fourth value was at or below the minimum.
//   4. Scale by 255, convert to int32, pack with saturation to int16 and then to bytes, and
//      store. The AVX2 path additionally applies PermuteMaskDeinterleave8x32 before storing
//      (presumably to undo the per-128-bit-lane ordering of the 256-bit packs — confirm).
// The trailing scalar loop handles whatever the vector path left over, one pixel at a time:
// a pixel whose fourth value is below fmin is written as four zero bytes; otherwise the first
// three values are multiplied by 255 / fourth, the fourth by 255, 0.5 is added, and the results
// are truncated to int and passed through ClampToByte.
unsafe void IConversionProcessor.ConvertLine(byte *ipstart, byte *opstart, int cb) { float *ip = (float *)ipstart, ipe = (float *)(ipstart + cb); byte * op = opstart; #if HWINTRINSICS if (Avx2.IsSupported) { var vzero = Vector256 <float> .Zero; var vmin = Vector256.Create(0.5f / byte.MaxValue); var vscale = Vector256.Create((float)byte.MaxValue); var vmaskp = Avx.LoadVector256((int *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.PermuteMaskDeinterleave8x32))); ipe -= Vector256 <byte> .Count; while (ip <= ipe) { var vf0 = Avx.LoadVector256(ip); var vf1 = Avx.LoadVector256(ip + Vector256 <float> .Count); var vf2 = Avx.LoadVector256(ip + Vector256 <float> .Count * 2); var vf3 = Avx.LoadVector256(ip + Vector256 <float> .Count * 3); ip += Vector256 <byte> .Count; var vfa0 = Avx.Shuffle(vf0, vf0, HWIntrinsics.ShuffleMaskAlpha); var vfa1 = Avx.Shuffle(vf1, vf1, HWIntrinsics.ShuffleMaskAlpha); var vfa2 = Avx.Shuffle(vf2, vf2, HWIntrinsics.ShuffleMaskAlpha); var vfa3 = Avx.Shuffle(vf3, vf3, HWIntrinsics.ShuffleMaskAlpha); vfa0 = Avx.Max(vfa0, vmin); vfa1 = Avx.Max(vfa1, vmin); vfa2 = Avx.Max(vfa2, vmin); vfa3 = Avx.Max(vfa3, vmin); vf0 = Avx.Multiply(vf0, Avx.Reciprocal(vfa0)); vf1 = Avx.Multiply(vf1, Avx.Reciprocal(vfa1)); vf2 = Avx.Multiply(vf2, Avx.Reciprocal(vfa2)); vf3 = Avx.Multiply(vf3, Avx.Reciprocal(vfa3)); vf0 = Avx.Blend(vf0, vfa0, HWIntrinsics.BlendMaskAlpha); vf1 = Avx.Blend(vf1, vfa1, HWIntrinsics.BlendMaskAlpha); vf2 = Avx.Blend(vf2, vfa2, HWIntrinsics.BlendMaskAlpha); vf3 = Avx.Blend(vf3, vfa3, HWIntrinsics.BlendMaskAlpha); vf0 = Avx.BlendVariable(vf0, vzero, HWIntrinsics.AvxCompareEqual(vfa0, vmin)); vf1 = Avx.BlendVariable(vf1, vzero, HWIntrinsics.AvxCompareEqual(vfa1, vmin)); vf2 = Avx.BlendVariable(vf2, vzero, HWIntrinsics.AvxCompareEqual(vfa2, vmin)); vf3 = Avx.BlendVariable(vf3, vzero, HWIntrinsics.AvxCompareEqual(vfa3, vmin)); vf0 = Avx.Multiply(vf0, vscale); vf1 = Avx.Multiply(vf1, vscale); vf2 = Avx.Multiply(vf2, vscale); vf3 = 
Avx.Multiply(vf3, vscale); var vi0 = Avx.ConvertToVector256Int32(vf0); var vi1 = Avx.ConvertToVector256Int32(vf1); var vi2 = Avx.ConvertToVector256Int32(vf2); var vi3 = Avx.ConvertToVector256Int32(vf3); var vs0 = Avx2.PackSignedSaturate(vi0, vi1); var vs1 = Avx2.PackSignedSaturate(vi2, vi3); var vb0 = Avx2.PackUnsignedSaturate(vs0, vs1); vb0 = Avx2.PermuteVar8x32(vb0.AsInt32(), vmaskp).AsByte(); Avx.Store(op, vb0); op += Vector256 <byte> .Count; } ipe += Vector256 <byte> .Count; } else if (Sse41.IsSupported) { var vzero = Vector128 <float> .Zero; var vmin = Vector128.Create(0.5f / byte.MaxValue); var vscale = Vector128.Create((float)byte.MaxValue); ipe -= Vector128 <byte> .Count; while (ip <= ipe) { var vf0 = Sse.LoadVector128(ip); var vf1 = Sse.LoadVector128(ip + Vector128 <float> .Count); var vf2 = Sse.LoadVector128(ip + Vector128 <float> .Count * 2); var vf3 = Sse.LoadVector128(ip + Vector128 <float> .Count * 3); ip += Vector128 <byte> .Count; var vfa0 = Sse.Shuffle(vf0, vf0, HWIntrinsics.ShuffleMaskAlpha); var vfa1 = Sse.Shuffle(vf1, vf1, HWIntrinsics.ShuffleMaskAlpha); var vfa2 = Sse.Shuffle(vf2, vf2, HWIntrinsics.ShuffleMaskAlpha); var vfa3 = Sse.Shuffle(vf3, vf3, HWIntrinsics.ShuffleMaskAlpha); vfa0 = Sse.Max(vfa0, vmin); vfa1 = Sse.Max(vfa1, vmin); vfa2 = Sse.Max(vfa2, vmin); vfa3 = Sse.Max(vfa3, vmin); vf0 = Sse.Multiply(vf0, Sse.Reciprocal(vfa0)); vf1 = Sse.Multiply(vf1, Sse.Reciprocal(vfa1)); vf2 = Sse.Multiply(vf2, Sse.Reciprocal(vfa2)); vf3 = Sse.Multiply(vf3, Sse.Reciprocal(vfa3)); vf0 = Sse41.Blend(vf0, vfa0, HWIntrinsics.BlendMaskAlpha); vf1 = Sse41.Blend(vf1, vfa1, HWIntrinsics.BlendMaskAlpha); vf2 = Sse41.Blend(vf2, vfa2, HWIntrinsics.BlendMaskAlpha); vf3 = Sse41.Blend(vf3, vfa3, HWIntrinsics.BlendMaskAlpha); vf0 = Sse41.BlendVariable(vf0, vzero, Sse.CompareEqual(vfa0, vmin)); vf1 = Sse41.BlendVariable(vf1, vzero, Sse.CompareEqual(vfa1, vmin)); vf2 = Sse41.BlendVariable(vf2, vzero, Sse.CompareEqual(vfa2, vmin)); vf3 = Sse41.BlendVariable(vf3, 
vzero, Sse.CompareEqual(vfa3, vmin)); vf0 = Sse.Multiply(vf0, vscale); vf1 = Sse.Multiply(vf1, vscale); vf2 = Sse.Multiply(vf2, vscale); vf3 = Sse.Multiply(vf3, vscale); var vi0 = Sse2.ConvertToVector128Int32(vf0); var vi1 = Sse2.ConvertToVector128Int32(vf1); var vi2 = Sse2.ConvertToVector128Int32(vf2); var vi3 = Sse2.ConvertToVector128Int32(vf3); var vs0 = Sse2.PackSignedSaturate(vi0, vi1); var vs1 = Sse2.PackSignedSaturate(vi2, vi3); var vb0 = Sse2.PackUnsignedSaturate(vs0, vs1); Sse2.Store(op, vb0); op += Vector128 <byte> .Count; } ipe += Vector128 <byte> .Count; } #endif float fmax = new Vector4(byte.MaxValue).X, fround = new Vector4(0.5f).X, fmin = fround / fmax; while (ip < ipe) { float f3 = ip[3]; if (f3 < fmin) { *(uint *)op = 0; } else { float f3i = fmax / f3; byte o0 = ClampToByte((int)(ip[0] * f3i + fround)); byte o1 = ClampToByte((int)(ip[1] * f3i + fround)); byte o2 = ClampToByte((int)(ip[2] * f3i + fround)); byte o3 = ClampToByte((int)(f3 * fmax + fround)); op[0] = o0; op[1] = o1; op[2] = o2; op[3] = o3; } ip += 4; op += 4; } }
// Converts one line of floats to bytes, one output byte per input float: each value is
// scaled by byte.MaxValue and narrowed to a byte.
//   ipstart - start of the float input line
//   opstart - start of the byte output line
//   cb      - length of the input line in bytes
//
// Paths (selected at compile time by HWINTRINSICS / VECTOR_CONVERT and at run time by CPU support):
//   * AVX2: 32 floats per iteration - scale, convert to int32, pack with saturation to int16 and
//           then to bytes, apply PermuteMaskDeinterleave8x32, store 32 bytes.
//   * SSE2: 16 floats per iteration - same scheme without the permute, store 16 bytes.
//   * Fallback with VECTOR_CONVERT: four VectorF loads per iteration, value * 255 + 0.5,
//           convert to int32, narrow to int16, clamp to [0, 255], narrow to bytes and write
//           with an unaligned store.
//   * Fallback without VECTOR_CONVERT: one VectorF per iteration, value * 255 + 0.5, clamped to
//           [0, 255], with elements written individually (four, or eight when VectorF.Count == 8).
//     NOTE(review): VectorF is presumably an alias for Vector<float> — confirm.
// Whatever the selected path leaves unprocessed is converted one value at a time with FixToByte.
unsafe void IConversionProcessor.ConvertLine(byte *ipstart, byte *opstart, int cb) { float *ip = (float *)ipstart, ipe = (float *)(ipstart + cb); byte * op = opstart; #if HWINTRINSICS if (Avx2.IsSupported) { var vscale = Vector256.Create((float)byte.MaxValue); var vmaskp = Avx.LoadVector256((int *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.PermuteMaskDeinterleave8x32))); ipe -= Vector256 <byte> .Count; while (ip <= ipe) { var vf0 = Avx.Multiply(vscale, Avx.LoadVector256(ip)); var vf1 = Avx.Multiply(vscale, Avx.LoadVector256(ip + Vector256 <float> .Count)); var vf2 = Avx.Multiply(vscale, Avx.LoadVector256(ip + Vector256 <float> .Count * 2)); var vf3 = Avx.Multiply(vscale, Avx.LoadVector256(ip + Vector256 <float> .Count * 3)); ip += Vector256 <byte> .Count; var vi0 = Avx.ConvertToVector256Int32(vf0); var vi1 = Avx.ConvertToVector256Int32(vf1); var vi2 = Avx.ConvertToVector256Int32(vf2); var vi3 = Avx.ConvertToVector256Int32(vf3); var vs0 = Avx2.PackSignedSaturate(vi0, vi1); var vs1 = Avx2.PackSignedSaturate(vi2, vi3); var vb0 = Avx2.PackUnsignedSaturate(vs0, vs1); vb0 = Avx2.PermuteVar8x32(vb0.AsInt32(), vmaskp).AsByte(); Avx.Store(op, vb0); op += Vector256 <byte> .Count; } ipe += Vector256 <byte> .Count; } else if (Sse2.IsSupported) { var vscale = Vector128.Create((float)byte.MaxValue); ipe -= Vector128 <byte> .Count; while (ip <= ipe) { var vf0 = Sse.Multiply(vscale, Sse.LoadVector128(ip)); var vf1 = Sse.Multiply(vscale, Sse.LoadVector128(ip + Vector128 <float> .Count)); var vf2 = Sse.Multiply(vscale, Sse.LoadVector128(ip + Vector128 <float> .Count * 2)); var vf3 = Sse.Multiply(vscale, Sse.LoadVector128(ip + Vector128 <float> .Count * 3)); ip += Vector128 <byte> .Count; var vi0 = Sse2.ConvertToVector128Int32(vf0); var vi1 = Sse2.ConvertToVector128Int32(vf1); var vi2 = Sse2.ConvertToVector128Int32(vf2); var vi3 = Sse2.ConvertToVector128Int32(vf3); var vs0 = Sse2.PackSignedSaturate(vi0, vi1); var vs1 = Sse2.PackSignedSaturate(vi2, vi3); var vb0 = 
Sse2.PackUnsignedSaturate(vs0, vs1); Sse2.Store(op, vb0); op += Vector128 <byte> .Count; } ipe += Vector128 <byte> .Count; } else #endif { #if VECTOR_CONVERT int unrollCount = Vector <byte> .Count; var vmin = new Vector <short>(byte.MinValue); var vmax = new Vector <short>(byte.MaxValue); var vscale = new VectorF(byte.MaxValue); #else int unrollCount = VectorF.Count; var vmin = new VectorF(byte.MinValue); var vmax = new VectorF(byte.MaxValue); #endif var vround = new VectorF(0.5f); ipe -= unrollCount; while (ip <= ipe) { #if VECTOR_CONVERT var vf0 = Unsafe.ReadUnaligned <VectorF>(ip); var vf1 = Unsafe.ReadUnaligned <VectorF>(ip + VectorF.Count); var vf2 = Unsafe.ReadUnaligned <VectorF>(ip + VectorF.Count * 2); var vf3 = Unsafe.ReadUnaligned <VectorF>(ip + VectorF.Count * 3); vf0 = vf0 * vscale + vround; vf1 = vf1 * vscale + vround; vf2 = vf2 * vscale + vround; vf3 = vf3 * vscale + vround; var vi0 = Vector.ConvertToInt32(vf0); var vi1 = Vector.ConvertToInt32(vf1); var vi2 = Vector.ConvertToInt32(vf2); var vi3 = Vector.ConvertToInt32(vf3); var vs0 = Vector.Narrow(vi0, vi1); var vs1 = Vector.Narrow(vi2, vi3); vs0 = vs0.Clamp(vmin, vmax); vs1 = vs1.Clamp(vmin, vmax); var vb = Vector.Narrow(Vector.AsVectorUInt16(vs0), Vector.AsVectorUInt16(vs1)); Unsafe.WriteUnaligned(op, vb); #else var v = Unsafe.ReadUnaligned <VectorF>(ip) * vmax + vround; v = v.Clamp(vmin, vmax); op[0] = (byte)v[0]; op[1] = (byte)v[1]; op[2] = (byte)v[2]; op[3] = (byte)v[3]; if (VectorF.Count == 8) { op[4] = (byte)v[4]; op[5] = (byte)v[5]; op[6] = (byte)v[6]; op[7] = (byte)v[7]; } #endif ip += unrollCount; op += unrollCount; } ipe += unrollCount; } while (ip < ipe) { op[0] = FixToByte(ip[0]); ip++; op++; } }