/// <summary>
/// Class-field scenario: blends the three instance fields with <c>Sse41.BlendVariable</c>,
/// writes the result to the data table's output buffer and validates it.
/// </summary>
public void RunClassFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario));

    // The operands are passed straight from the instance fields.
    var blended = Sse41.BlendVariable(_fld1, _fld2, _fld3);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(_fld1, _fld2, _fld3, _dataTable.outArrayPtr);
}
/// <summary>
/// Local-variable scenario: loads the three <c>Byte</c> operands with aligned loads into
/// locals, blends them, writes the result to the output buffer and validates it.
/// </summary>
public void RunLclVarScenario_LoadAligned()
{
    // Operands are held in locals before the call.
    var left = Sse2.LoadAlignedVector128((Byte*)_dataTable.inArray1Ptr);
    var right = Sse2.LoadAlignedVector128((Byte*)_dataTable.inArray2Ptr);
    var mask = Sse2.LoadAlignedVector128((Byte*)_dataTable.inArray3Ptr);

    var blended = Sse41.BlendVariable(left, right, mask);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(left, right, mask, _dataTable.outArrayPtr);
}
/// <summary>
/// Class-local-field scenario: creates a fresh test instance, blends its three fields,
/// writes the result to this instance's output buffer and validates it.
/// </summary>
public void RunClassLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario));

    // Operands come from the fields of a locally created class instance.
    var local = new SimpleTernaryOpTest__BlendVariableUInt16();
    var blended = Sse41.BlendVariable(local._fld1, local._fld2, local._fld3);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(local._fld1, local._fld2, local._fld3, _dataTable.outArrayPtr);
}
// Entry point (truncated in this view: the return statement and the method's closing brace are not visible).
// When SSE4.1 is supported, builds three Vector128<int> values and calls Sse41.BlendVariable with the
// second operand passed through a ref local (rightRef); the result of the call is discarded.
public static int Main() { if (Sse41.IsSupported) { Vector128 <int> left = Vector128.Create(1); Vector128 <int> right = Vector128.Create(2); ref var rightRef = ref right; Vector128 <int> mask = Vector128.Create(3); Sse41.BlendVariable(left, rightRef, mask); }
/// <summary>
/// Local-variable scenario: reads the three <c>Vector128&lt;Byte&gt;</c> operands with
/// <c>Unsafe.Read</c> into locals, blends them, writes the result to the output buffer
/// and validates it.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    // Operands are held in locals before the call.
    var left = Unsafe.Read<Vector128<Byte>>(_dataTable.inArray1Ptr);
    var right = Unsafe.Read<Vector128<Byte>>(_dataTable.inArray2Ptr);
    var mask = Unsafe.Read<Vector128<Byte>>(_dataTable.inArray3Ptr);

    var blended = Sse41.BlendVariable(left, right, mask);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(left, right, mask, _dataTable.outArrayPtr);
}
/// <summary>
/// Struct-local-field scenario: creates a <c>TestStruct</c>, blends its three fields,
/// writes the result to the output buffer and validates it.
/// </summary>
public void RunStructLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario));

    // Operands come from the fields of a locally created struct.
    var local = TestStruct.Create();
    var blended = Sse41.BlendVariable(local._fld1, local._fld2, local._fld3);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(local._fld1, local._fld2, local._fld3, _dataTable.outArrayPtr);
}
/// <summary>
/// Per-lane select for float vectors: yields <paramref name="a"/> where the mask
/// <paramref name="m"/> is set and <paramref name="b"/> elsewhere.
/// </summary>
/// <param name="m">Selection mask, reinterpreted as single-precision lanes.</param>
/// <param name="a">Value chosen where the mask is set.</param>
/// <param name="b">Value chosen where the mask is clear.</param>
/// <returns>The blended vector.</returns>
public static f32 Select_f32(m32 m, f32 a, f32 b)
{
    if (!Sse41.IsSupported)
    {
        // Bitwise fallback: b ^ (m & (a ^ b)).
        return Xor(b, And(m.AsSingle(), Xor(a, b)));
    }

    return Sse41.BlendVariable(b, a, m.AsSingle());
}
/// <summary>
/// Class-variable scenario: blends the three class-level vectors
/// (<c>_clsVar1</c>, <c>_clsVar2</c>, <c>_clsVar3</c>), writes the result to the output
/// buffer and validates it.
/// </summary>
public void RunClsVarScenario()
{
    var blended = Sse41.BlendVariable(_clsVar1, _clsVar2, _clsVar3);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(_clsVar1, _clsVar2, _clsVar3, _dataTable.outArrayPtr);
}
/// <summary>
/// Per-lane select for integer vectors: yields <paramref name="a"/> where the mask
/// <paramref name="m"/> is set and <paramref name="b"/> elsewhere.
/// </summary>
/// <param name="m">Selection mask.</param>
/// <param name="a">Value chosen where the mask is set.</param>
/// <param name="b">Value chosen where the mask is clear.</param>
/// <returns>The blended vector.</returns>
public static i32 Select_i32(m32 m, i32 a, i32 b)
{
    if (!Sse41.IsSupported)
    {
        // Bitwise fallback: b ^ (m & (a ^ b)).
        return Xor(b, And(m, Xor(a, b)));
    }

    return Sse41.BlendVariable(b, a, m);
}
/// <summary>
/// Basic scenario: passes three aligned <c>Byte</c> loads directly as the arguments of
/// <c>Sse41.BlendVariable</c>, writes the result to the output buffer and validates it
/// against the raw input buffers.
/// </summary>
public void RunBasicScenario_LoadAligned()
{
    // The loads stay inline as call arguments (no intermediate locals).
    var blended = Sse41.BlendVariable(
        Sse2.LoadAlignedVector128((Byte*)_dataTable.inArray1Ptr),
        Sse2.LoadAlignedVector128((Byte*)_dataTable.inArray2Ptr),
        Sse2.LoadAlignedVector128((Byte*)_dataTable.inArray3Ptr));

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(
        _dataTable.inArray1Ptr,
        _dataTable.inArray2Ptr,
        _dataTable.inArray3Ptr,
        _dataTable.outArrayPtr);
}
/// <summary>
/// Basic scenario: passes three <c>Unsafe.Read&lt;Vector128&lt;Byte&gt;&gt;</c> results
/// directly as the arguments of <c>Sse41.BlendVariable</c>, writes the result to the
/// output buffer and validates it against the raw input buffers.
/// </summary>
public void RunBasicScenario_UnsafeRead()
{
    // The reads stay inline as call arguments (no intermediate locals).
    var blended = Sse41.BlendVariable(
        Unsafe.Read<Vector128<Byte>>(_dataTable.inArray1Ptr),
        Unsafe.Read<Vector128<Byte>>(_dataTable.inArray2Ptr),
        Unsafe.Read<Vector128<Byte>>(_dataTable.inArray3Ptr));

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(
        _dataTable.inArray1Ptr,
        _dataTable.inArray2Ptr,
        _dataTable.inArray3Ptr,
        _dataTable.outArrayPtr);
}
/// <summary>
/// Local-variable scenario: loads the three <c>Int16</c> operands with aligned loads into
/// locals, blends them, writes the result to the output buffer and validates it.
/// </summary>
public void RunLclVarScenario_LoadAligned()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned));

    // Operands are held in locals before the call.
    var left = Sse2.LoadAlignedVector128((Int16*)_dataTable.inArray1Ptr);
    var right = Sse2.LoadAlignedVector128((Int16*)_dataTable.inArray2Ptr);
    var mask = Sse2.LoadAlignedVector128((Int16*)_dataTable.inArray3Ptr);

    var blended = Sse41.BlendVariable(left, right, mask);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(left, right, mask, _dataTable.outArrayPtr);
}
/// <summary>
/// Local-variable scenario: reads the three <c>Vector128&lt;Int16&gt;</c> operands with
/// <c>Unsafe.Read</c> into locals, blends them, writes the result to the output buffer
/// and validates it.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

    // Operands are held in locals before the call.
    var left = Unsafe.Read<Vector128<Int16>>(_dataTable.inArray1Ptr);
    var right = Unsafe.Read<Vector128<Int16>>(_dataTable.inArray2Ptr);
    var mask = Unsafe.Read<Vector128<Int16>>(_dataTable.inArray3Ptr);

    var blended = Sse41.BlendVariable(left, right, mask);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(left, right, mask, _dataTable.outArrayPtr);
}
/// <summary>
/// Local-variable scenario: reads the three <c>Vector128&lt;SByte&gt;</c> operands with
/// <c>Unsafe.Read</c> into locals, blends them, writes the result to the output buffer
/// and validates it.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

    // Operands are held in locals before the call.
    var left = Unsafe.Read<Vector128<SByte>>(_dataTable.inArray1Ptr);
    var right = Unsafe.Read<Vector128<SByte>>(_dataTable.inArray2Ptr);
    var mask = Unsafe.Read<Vector128<SByte>>(_dataTable.inArray3Ptr);

    var blended = Sse41.BlendVariable(left, right, mask);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(left, right, mask, _dataTable.outArrayPtr);
}
/// <summary>
/// Local-variable scenario: loads the three <c>UInt16</c> operands with aligned loads into
/// locals, blends them, writes the result to the output buffer and validates it.
/// </summary>
public void RunLclVarScenario_LoadAligned()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned));

    // Operands are held in locals before the call.
    var left = Sse2.LoadAlignedVector128((UInt16*)_dataTable.inArray1Ptr);
    var right = Sse2.LoadAlignedVector128((UInt16*)_dataTable.inArray2Ptr);
    var mask = Sse2.LoadAlignedVector128((UInt16*)_dataTable.inArray3Ptr);

    var blended = Sse41.BlendVariable(left, right, mask);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(left, right, mask, _dataTable.outArrayPtr);
}
/// <summary>
/// Basic scenario: passes three <c>Unsafe.Read&lt;Vector128&lt;UInt16&gt;&gt;</c> results
/// directly as the arguments of <c>Sse41.BlendVariable</c>, writes the result to the
/// output buffer and validates it against the raw input buffers.
/// </summary>
public void RunBasicScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_UnsafeRead));

    // The reads stay inline as call arguments (no intermediate locals).
    var blended = Sse41.BlendVariable(
        Unsafe.Read<Vector128<UInt16>>(_dataTable.inArray1Ptr),
        Unsafe.Read<Vector128<UInt16>>(_dataTable.inArray2Ptr),
        Unsafe.Read<Vector128<UInt16>>(_dataTable.inArray3Ptr));

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(
        _dataTable.inArray1Ptr,
        _dataTable.inArray2Ptr,
        _dataTable.inArray3Ptr,
        _dataTable.outArrayPtr);
}
/// <summary>
/// Basic scenario: passes three aligned <c>UInt16</c> loads directly as the arguments of
/// <c>Sse41.BlendVariable</c>, writes the result to the output buffer and validates it
/// against the raw input buffers.
/// </summary>
public void RunBasicScenario_LoadAligned()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_LoadAligned));

    // The loads stay inline as call arguments (no intermediate locals).
    var blended = Sse41.BlendVariable(
        Sse2.LoadAlignedVector128((UInt16*)_dataTable.inArray1Ptr),
        Sse2.LoadAlignedVector128((UInt16*)_dataTable.inArray2Ptr),
        Sse2.LoadAlignedVector128((UInt16*)_dataTable.inArray3Ptr));

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(
        _dataTable.inArray1Ptr,
        _dataTable.inArray2Ptr,
        _dataTable.inArray3Ptr,
        _dataTable.outArrayPtr);
}
/// <summary>
/// Struct-local-field load scenario: creates a <c>TestStruct</c>, loads each of its three
/// fields through its address with <c>Sse2.LoadVector128</c>, blends them, writes the
/// result to the output buffer and validates it.
/// </summary>
public void RunStructLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

    var local = TestStruct.Create();

    // The loads stay inline as call arguments and read directly from the local's fields.
    var blended = Sse41.BlendVariable(
        Sse2.LoadVector128((Int16*)&local._fld1),
        Sse2.LoadVector128((Int16*)&local._fld2),
        Sse2.LoadVector128((Int16*)&local._fld3));

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(local._fld1, local._fld2, local._fld3, _dataTable.outArrayPtr);
}
/// <summary>
/// Struct-field load scenario: pins this struct's three fields, loads them with
/// <c>Sse2.LoadVector128</c>, blends them, writes the result to the output buffer of
/// <paramref name="testClass"/> and asks <paramref name="testClass"/> to validate it.
/// </summary>
/// <param name="testClass">Test instance that owns the output buffer and the validation routine.</param>
public void RunStructFldScenario_Load(SimpleTernaryOpTest__BlendVariableInt16 testClass)
{
    fixed (Vector128<Int16>* leftPtr = &_fld1)
    fixed (Vector128<Int16>* rightPtr = &_fld2)
    fixed (Vector128<Int16>* maskPtr = &_fld3)
    {
        // The loads stay inline as call arguments.
        var blended = Sse41.BlendVariable(
            Sse2.LoadVector128((Int16*)leftPtr),
            Sse2.LoadVector128((Int16*)rightPtr),
            Sse2.LoadVector128((Int16*)maskPtr));

        Unsafe.Write(testClass._dataTable.outArrayPtr, blended);
        testClass.ValidateResult(_fld1, _fld2, _fld3, testClass._dataTable.outArrayPtr);
    }
}
/// <summary>
/// Copies up to <c>LineBuffer.Length</c> characters from <paramref name="input"/> into
/// <c>LineBuffer</c>, replacing every NUL character (value 0) with a space.
/// </summary>
/// <param name="input">Source characters; only the first <c>LineBuffer.Length</c> are consumed.</param>
/// <returns>The number of characters written, i.e. <c>Math.Min(LineBuffer.Length, input.Length)</c>.</returns>
private static unsafe int FillBuffer(ReadOnlySpan<char> input)
{
    int count = Math.Min(LineBuffer.Length, input.Length);
    int i = 0;

    fixed (char* buffer = LineBuffer, pInput = input)
    {
        // Vector path: handles full groups of Vector128<ushort>.Count characters.
        if (Sse2.IsSupported && count >= Vector128<ushort>.Count)
        {
            // Space character broadcast to every lane.
            // NOTE(review): SpaceCharUShort is defined elsewhere; presumably (ushort)' ' — confirm.
            Vector128<ushort> Space = Vector128.Create(SpaceCharUShort);

            do
            {
                var data = Sse2.LoadVector128((ushort*)pInput + i);

                // Lanes equal to zero become all-ones in the comparison mask.
                var comp = Vector128<ushort>.Zero;
                comp = Sse2.CompareEqual(comp, data);

                if (Sse41.IsSupported)
                {
                    // Take Space where the mask is set, the original character elsewhere.
                    data = Sse41.BlendVariable(data, Space, comp);
                }
                else
                {
                    // SSE2-only fallback: OR the masked space into the data.
                    // Elements being replaced are already 0'ed, so OR is sufficient.
                    comp = Sse2.And(comp, Space);
                    data = Sse2.Or(data, comp);
                }

                Sse2.Store((ushort*)buffer + i, data);
                i += Vector128<ushort>.Count;
            }
            while ((count - i) >= Vector128<ushort>.Count);
        }

        // Scalar tail (or the whole input when the vector path is not taken).
        while (i < count)
        {
            char tmp = pInput[i];
            buffer[i] = tmp == 0 ? ' ' : tmp;
            i += 1;
        }

        return count;
    }
}
/// <summary>
/// Class-variable load scenario: pins the three class-level vectors, loads them with
/// <c>Sse2.LoadVector128</c>, blends them, writes the result to the output buffer and
/// validates it.
/// </summary>
public void RunClsVarScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario_Load));

    fixed (Vector128<Int32>* leftPtr = &_clsVar1)
    fixed (Vector128<Int32>* rightPtr = &_clsVar2)
    fixed (Vector128<Int32>* maskPtr = &_clsVar3)
    {
        // The loads stay inline as call arguments.
        var blended = Sse41.BlendVariable(
            Sse2.LoadVector128((Int32*)leftPtr),
            Sse2.LoadVector128((Int32*)rightPtr),
            Sse2.LoadVector128((Int32*)maskPtr));

        Unsafe.Write(_dataTable.outArrayPtr, blended);
        ValidateResult(_clsVar1, _clsVar2, _clsVar3, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Class-field load scenario: pins the three instance fields, loads them with
/// <c>Sse2.LoadVector128</c>, blends them, writes the result to the output buffer and
/// validates it.
/// </summary>
public void RunClassFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

    fixed (Vector128<Int16>* leftPtr = &_fld1)
    fixed (Vector128<Int16>* rightPtr = &_fld2)
    fixed (Vector128<Int16>* maskPtr = &_fld3)
    {
        // The loads stay inline as call arguments.
        var blended = Sse41.BlendVariable(
            Sse2.LoadVector128((Int16*)leftPtr),
            Sse2.LoadVector128((Int16*)rightPtr),
            Sse2.LoadVector128((Int16*)maskPtr));

        Unsafe.Write(_dataTable.outArrayPtr, blended);
        ValidateResult(_fld1, _fld2, _fld3, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Chooses between <paramref name="left"/> and <paramref name="right"/> under control of
/// <paramref name="selector"/>.
/// </summary>
/// <remarks>
/// With SSE4.1 the choice is made by <c>Sse41.BlendVariable</c> at float, double or byte
/// granularity depending on <typeparamref name="T"/>. Without SSE4.1 the result is computed
/// as <c>Or(And(selector, right), AndNot(selector, left))</c> using helpers defined elsewhere.
/// NOTE(review): the two paths presumably agree only when every selector lane is all-ones
/// or all-zeros — confirm against callers.
/// </remarks>
/// <typeparam name="T">Element type of the data vectors.</typeparam>
/// <typeparam name="U">Element type of the selector vector.</typeparam>
/// <param name="left">First data operand.</param>
/// <param name="right">Second data operand.</param>
/// <param name="selector">Selection mask.</param>
/// <returns>The blended vector.</returns>
public static Vector128<T> Select<T, U>(Vector128<T> left, Vector128<T> right, Vector128<U> selector)
    where T : struct
    where U : struct
{
    if (!Sse41.IsSupported)
    {
        // Software fallback built from bitwise helpers.
        return Or(And(selector.As<U, T>(), right), AndNot(selector.As<U, T>(), left));
    }

    if (typeof(T) == typeof(float))
    {
        var blendedSingles = Sse41.BlendVariable(left.AsSingle(), right.AsSingle(), selector.AsSingle());
        return blendedSingles.As<float, T>();
    }

    if (typeof(T) == typeof(double))
    {
        var blendedDoubles = Sse41.BlendVariable(left.AsDouble(), right.AsDouble(), selector.AsDouble());
        return blendedDoubles.As<double, T>();
    }

    // Every other element type is blended byte by byte.
    var blendedBytes = Sse41.BlendVariable(left.AsByte(), right.AsByte(), selector.AsByte());
    return blendedBytes.As<byte, T>();
}
/// <summary>
/// Class-local-field load scenario: creates a fresh test instance, pins its three fields,
/// loads them with <c>Sse2.LoadVector128</c>, blends them, writes the result to this
/// instance's output buffer and validates it.
/// </summary>
public void RunClassLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

    var local = new SimpleTernaryOpTest__BlendVariableInt32();

    fixed (Vector128<Int32>* leftPtr = &local._fld1)
    fixed (Vector128<Int32>* rightPtr = &local._fld2)
    fixed (Vector128<Int32>* maskPtr = &local._fld3)
    {
        // The loads stay inline as call arguments.
        var blended = Sse41.BlendVariable(
            Sse2.LoadVector128((Int32*)leftPtr),
            Sse2.LoadVector128((Int32*)rightPtr),
            Sse2.LoadVector128((Int32*)maskPtr));

        Unsafe.Write(_dataTable.outArrayPtr, blended);
        ValidateResult(local._fld1, local._fld2, local._fld3, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Copies <c>buffer.Length</c> UTF-16 code units from the memory addressed by
/// <paramref name="state"/> into <paramref name="buffer"/>, writing a space wherever the
/// source holds a '+'.
/// </summary>
/// <param name="buffer">Destination span; its length determines how many code units are read.</param>
/// <param name="state">Address of the source characters.</param>
private static unsafe void ReplacePlusWithSpaceCore(Span<char> buffer, IntPtr state)
{
    fixed (char* destination = &MemoryMarshal.GetReference(buffer))
    {
        var src = (ushort*)state.ToPointer();
        var dst = (ushort*)destination;
        var length = (nint)(uint)buffer.Length;
        var offset = (nint)0;

        if (Sse41.IsSupported)
        {
            var plus = Vector128.Create((ushort)'+');
            var space = Vector128.Create((ushort)' ');

            // Process whole vectors while a full one still fits; when length is shorter
            // than one vector the bound is negative and the loop body never runs.
            for (; offset <= length - Vector128<ushort>.Count; offset += Vector128<ushort>.Count)
            {
                var chunk = Sse2.LoadVector128(src + offset);
                var isPlus = Sse2.CompareEqual(chunk, plus);
                var replaced = Sse41.BlendVariable(chunk, space, isPlus);
                Sse2.Store(dst + offset, replaced);
            }
        }

        // Scalar tail (or everything, when the vector path is unavailable).
        while (offset < length)
        {
            dst[offset] = src[offset] != '+' ? src[offset] : ' ';
            ++offset;
        }
    }
}
/// <summary>
/// Writes <paramref name="value"/> as a MessagePack array of signed bytes, using a
/// 16-bytes-per-iteration SIMD path when available and a per-element loop otherwise.
/// </summary>
/// <param name="writer">Destination writer.</param>
/// <param name="value">Array to serialize; <c>null</c> is written as nil.</param>
/// <param name="options">Serializer options (not read by this method).</param>
public unsafe void Serialize(ref MessagePackWriter writer, sbyte[]? value, MessagePackSerializerOptions options)
{
    if (value == null)
    {
        writer.WriteNil();
        return;
    }

    var inputLength = value.Length;
    writer.WriteArrayHeader(inputLength);
    if (inputLength == 0)
    {
        return;
    }

    fixed (sbyte* pSource = &value[0])
    {
        var inputEnd = pSource + inputLength;
        var inputIterator = pSource;

        // NOTE(review): the gate checks only Popcnt, while the body also uses SSE2, SSSE3 and
        // SSE4.1 intrinsics — confirm Popcnt support is intended to imply those.
        if (Popcnt.IsSupported)
        {
            const int ShiftCount = 4;
            const int Stride = 1 << ShiftCount;

            // We enter the SIMD mode when there are more than the Stride after alignment adjustment.
            if (inputLength < Stride << 1)
            {
                goto ProcessEach;
            }

            {
                // Make InputIterator Aligned: write the leading elements one by one.
                var offset = UnsafeMemoryAlignmentUtility.CalculateDifferenceAlign16(inputIterator);
                inputLength -= offset;
                var offsetEnd = inputIterator + offset;
                while (inputIterator != offsetEnd)
                {
                    writer.Write(*inputIterator++);
                }
            }

            fixed (byte* tablePointer = &ShuffleAndMaskTable[0])
            {
                fixed (byte* maskTablePointer = &SingleInstructionMultipleDataPrimitiveArrayFormatterHelper.StoreMaskTable[0])
                {
                    var vectorMinFixNegInt = Vector128.Create((sbyte)MessagePackRange.MinFixNegativeInt);
                    var vectorMessagePackCodeInt8 = Vector128.Create(MessagePackCode.Int8);

                    // Only whole Stride-sized groups are vectorized; the remainder falls through to ProcessEach.
                    for (var vectorizedEnd = inputIterator + ((inputLength >> ShiftCount) << ShiftCount); inputIterator != vectorizedEnd; inputIterator += Stride)
                    {
                        var current = Sse2.LoadVector128(inputIterator);

                        // One bit per lane whose value is below MinFixNegativeInt.
                        var index = unchecked((uint)Sse2.MoveMask(Sse2.CompareGreaterThan(vectorMinFixNegInt, current)));

                        if (index == 0)
                        {
                            // When all 16 input values are in the FixNum range, copy them verbatim.
                            var span = writer.GetSpan(Stride);
                            Sse2.Store((sbyte*)Unsafe.AsPointer(ref span[0]), current);
                            writer.Advance(Stride);
                            continue;
                        }

                        unchecked
                        {
                            // Split the 16-bit lane mask into two 8-lane halves; each half emits
                            // 8 value bytes plus one extra byte per flagged lane.
                            var index0 = (byte)index;
                            var index1 = (byte)(index >> 8);
                            var count0 = (int)(Popcnt.PopCount(index0) + 8);
                            var count1 = (int)(Popcnt.PopCount(index1) + 8);
                            var countTotal = count0 + count1;
                            var destination = writer.GetSpan(countTotal);
                            fixed (byte* pDestination = &destination[0])
                            {
                                var tempDestination = pDestination;

                                // Low half. The table entry is used both as the shuffle control and as
                                // the blend mask that inserts the Int8 code bytes.
                                var shuffle0 = Sse2.LoadVector128(tablePointer + (index0 << 4));
                                var shuffled0 = Ssse3.Shuffle(current.AsByte(), shuffle0);
                                var answer0 = Sse41.BlendVariable(shuffled0, vectorMessagePackCodeInt8, shuffle0);
                                Sse2.MaskMove(answer0, Sse2.LoadVector128(maskTablePointer + (count0 << 4)), tempDestination);
                                tempDestination += count0;

                                // High half: shift the upper 8 bytes down, then repeat.
                                var shuffle1 = Sse2.LoadVector128(tablePointer + (index1 << 4));
                                var shift1 = Sse2.ShiftRightLogical128BitLane(current.AsByte(), 8);
                                var shuffled1 = Ssse3.Shuffle(shift1, shuffle1);
                                var answer1 = Sse41.BlendVariable(shuffled1, vectorMessagePackCodeInt8, shuffle1);
                                Sse2.MaskMove(answer1, Sse2.LoadVector128(maskTablePointer + (count1 << 4)), tempDestination);
                            }

                            writer.Advance(countTotal);
                        }
                    }
                }
            }
        }

    ProcessEach:
        // Scalar path: short inputs, the unvectorized tail, or no SIMD support.
        while (inputIterator != inputEnd)
        {
            writer.Write(*inputIterator++);
        }
    }
}
/// <summary>
/// Converts one line of 4-float pixels to 4-byte pixels. For each pixel the first three
/// channels are multiplied by 255 divided by the fourth channel, and the fourth channel is
/// multiplied by 255; pixels whose fourth channel is below 0.5/255 are written as zero.
/// </summary>
/// <remarks>
/// NOTE(review): this looks like an un-premultiply-alpha float-to-byte conversion — confirm.
/// The vector paths use reciprocal approximations and compare against the clamped minimum,
/// so they may differ marginally from the scalar path at the threshold.
/// </remarks>
/// <param name="ipstart">Start of the input floats.</param>
/// <param name="opstart">Start of the output bytes.</param>
/// <param name="cb">Length of the input in bytes.</param>
unsafe void IConversionProcessor.ConvertLine(byte* ipstart, byte* opstart, int cb)
{
    float* ip = (float*)ipstart, ipe = (float*)(ipstart + cb);
    byte* op = opstart;

#if HWINTRINSICS
    if (Avx2.IsSupported)
    {
        var vzero = Vector256<float>.Zero;
        var vmin = Vector256.Create(0.5f / byte.MaxValue);
        var vscale = Vector256.Create((float)byte.MaxValue);
        var vmaskp = Avx.LoadVector256((int*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.PermuteMaskDeinterleave8x32)));

        // Each iteration consumes Vector256<byte>.Count floats and emits as many bytes.
        ipe -= Vector256<byte>.Count;
        while (ip <= ipe)
        {
            var vf0 = Avx.LoadVector256(ip);
            var vf1 = Avx.LoadVector256(ip + Vector256<float>.Count);
            var vf2 = Avx.LoadVector256(ip + Vector256<float>.Count * 2);
            var vf3 = Avx.LoadVector256(ip + Vector256<float>.Count * 3);
            ip += Vector256<byte>.Count;

            // Broadcast each pixel's fourth channel across the pixel.
            var vfa0 = Avx.Shuffle(vf0, vf0, HWIntrinsics.ShuffleMaskAlpha);
            var vfa1 = Avx.Shuffle(vf1, vf1, HWIntrinsics.ShuffleMaskAlpha);
            var vfa2 = Avx.Shuffle(vf2, vf2, HWIntrinsics.ShuffleMaskAlpha);
            var vfa3 = Avx.Shuffle(vf3, vf3, HWIntrinsics.ShuffleMaskAlpha);

            // Clamp it from below so the reciprocal stays finite.
            vfa0 = Avx.Max(vfa0, vmin);
            vfa1 = Avx.Max(vfa1, vmin);
            vfa2 = Avx.Max(vfa2, vmin);
            vfa3 = Avx.Max(vfa3, vmin);

            vf0 = Avx.Multiply(vf0, Avx.Reciprocal(vfa0));
            vf1 = Avx.Multiply(vf1, Avx.Reciprocal(vfa1));
            vf2 = Avx.Multiply(vf2, Avx.Reciprocal(vfa2));
            vf3 = Avx.Multiply(vf3, Avx.Reciprocal(vfa3));

            // Put the (clamped) fourth channel back into its own lane.
            vf0 = Avx.Blend(vf0, vfa0, HWIntrinsics.BlendMaskAlpha);
            vf1 = Avx.Blend(vf1, vfa1, HWIntrinsics.BlendMaskAlpha);
            vf2 = Avx.Blend(vf2, vfa2, HWIntrinsics.BlendMaskAlpha);
            vf3 = Avx.Blend(vf3, vfa3, HWIntrinsics.BlendMaskAlpha);

            // Zero every pixel whose fourth channel was clamped to the minimum.
            vf0 = Avx.BlendVariable(vf0, vzero, HWIntrinsics.AvxCompareEqual(vfa0, vmin));
            vf1 = Avx.BlendVariable(vf1, vzero, HWIntrinsics.AvxCompareEqual(vfa1, vmin));
            vf2 = Avx.BlendVariable(vf2, vzero, HWIntrinsics.AvxCompareEqual(vfa2, vmin));
            vf3 = Avx.BlendVariable(vf3, vzero, HWIntrinsics.AvxCompareEqual(vfa3, vmin));

            vf0 = Avx.Multiply(vf0, vscale);
            vf1 = Avx.Multiply(vf1, vscale);
            vf2 = Avx.Multiply(vf2, vscale);
            vf3 = Avx.Multiply(vf3, vscale);

            var vi0 = Avx.ConvertToVector256Int32(vf0);
            var vi1 = Avx.ConvertToVector256Int32(vf1);
            var vi2 = Avx.ConvertToVector256Int32(vf2);
            var vi3 = Avx.ConvertToVector256Int32(vf3);

            // Narrow int32 -> int16 -> byte with saturation, then permute the packed
            // result with the deinterleave mask.
            var vs0 = Avx2.PackSignedSaturate(vi0, vi1);
            var vs1 = Avx2.PackSignedSaturate(vi2, vi3);

            var vb0 = Avx2.PackUnsignedSaturate(vs0, vs1);
            vb0 = Avx2.PermuteVar8x32(vb0.AsInt32(), vmaskp).AsByte();

            Avx.Store(op, vb0);
            op += Vector256<byte>.Count;
        }

        // Restore the real end pointer for the scalar tail.
        ipe += Vector256<byte>.Count;
    }
    else if (Sse41.IsSupported)
    {
        var vzero = Vector128<float>.Zero;
        var vmin = Vector128.Create(0.5f / byte.MaxValue);
        var vscale = Vector128.Create((float)byte.MaxValue);

        // Same algorithm as the AVX2 path, four pixels per iteration.
        ipe -= Vector128<byte>.Count;
        while (ip <= ipe)
        {
            var vf0 = Sse.LoadVector128(ip);
            var vf1 = Sse.LoadVector128(ip + Vector128<float>.Count);
            var vf2 = Sse.LoadVector128(ip + Vector128<float>.Count * 2);
            var vf3 = Sse.LoadVector128(ip + Vector128<float>.Count * 3);
            ip += Vector128<byte>.Count;

            var vfa0 = Sse.Shuffle(vf0, vf0, HWIntrinsics.ShuffleMaskAlpha);
            var vfa1 = Sse.Shuffle(vf1, vf1, HWIntrinsics.ShuffleMaskAlpha);
            var vfa2 = Sse.Shuffle(vf2, vf2, HWIntrinsics.ShuffleMaskAlpha);
            var vfa3 = Sse.Shuffle(vf3, vf3, HWIntrinsics.ShuffleMaskAlpha);

            vfa0 = Sse.Max(vfa0, vmin);
            vfa1 = Sse.Max(vfa1, vmin);
            vfa2 = Sse.Max(vfa2, vmin);
            vfa3 = Sse.Max(vfa3, vmin);

            vf0 = Sse.Multiply(vf0, Sse.Reciprocal(vfa0));
            vf1 = Sse.Multiply(vf1, Sse.Reciprocal(vfa1));
            vf2 = Sse.Multiply(vf2, Sse.Reciprocal(vfa2));
            vf3 = Sse.Multiply(vf3, Sse.Reciprocal(vfa3));

            vf0 = Sse41.Blend(vf0, vfa0, HWIntrinsics.BlendMaskAlpha);
            vf1 = Sse41.Blend(vf1, vfa1, HWIntrinsics.BlendMaskAlpha);
            vf2 = Sse41.Blend(vf2, vfa2, HWIntrinsics.BlendMaskAlpha);
            vf3 = Sse41.Blend(vf3, vfa3, HWIntrinsics.BlendMaskAlpha);

            vf0 = Sse41.BlendVariable(vf0, vzero, Sse.CompareEqual(vfa0, vmin));
            vf1 = Sse41.BlendVariable(vf1, vzero, Sse.CompareEqual(vfa1, vmin));
            vf2 = Sse41.BlendVariable(vf2, vzero, Sse.CompareEqual(vfa2, vmin));
            vf3 = Sse41.BlendVariable(vf3, vzero, Sse.CompareEqual(vfa3, vmin));

            vf0 = Sse.Multiply(vf0, vscale);
            vf1 = Sse.Multiply(vf1, vscale);
            vf2 = Sse.Multiply(vf2, vscale);
            vf3 = Sse.Multiply(vf3, vscale);

            var vi0 = Sse2.ConvertToVector128Int32(vf0);
            var vi1 = Sse2.ConvertToVector128Int32(vf1);
            var vi2 = Sse2.ConvertToVector128Int32(vf2);
            var vi3 = Sse2.ConvertToVector128Int32(vf3);

            var vs0 = Sse2.PackSignedSaturate(vi0, vi1);
            var vs1 = Sse2.PackSignedSaturate(vi2, vi3);

            var vb0 = Sse2.PackUnsignedSaturate(vs0, vs1);

            Sse2.Store(op, vb0);
            op += Vector128<byte>.Count;
        }

        // Restore the real end pointer for the scalar tail.
        ipe += Vector128<byte>.Count;
    }
#endif

    // Scalar path: handles the tail left by the vector loops, or the whole line.
    // NOTE(review): the constants are read back through Vector4; presumably deliberate — left as is.
    float fmax = new Vector4(byte.MaxValue).X, fround = new Vector4(0.5f).X, fmin = fround / fmax;

    while (ip < ipe)
    {
        float f3 = ip[3];
        if (f3 < fmin)
        {
            // Fourth channel too small: emit an all-zero pixel.
            *(uint*)op = 0;
        }
        else
        {
            float f3i = fmax / f3;
            byte o0 = ClampToByte((int)(ip[0] * f3i + fround));
            byte o1 = ClampToByte((int)(ip[1] * f3i + fround));
            byte o2 = ClampToByte((int)(ip[2] * f3i + fround));
            byte o3 = ClampToByte((int)(f3 * fmax + fround));
            op[0] = o0;
            op[1] = o1;
            op[2] = o2;
            op[3] = o3;
        }

        ip += 4;
        op += 4;
    }
}
/// <summary>
/// Thin wrapper named after the C intrinsic; forwards to <c>Sse41.BlendVariable</c> for
/// signed-byte vectors.
/// </summary>
/// <param name="left">First operand, passed through unchanged.</param>
/// <param name="right">Second operand, passed through unchanged.</param>
/// <param name="mask">Selection mask, passed through unchanged.</param>
/// <returns>The value returned by <c>Sse41.BlendVariable</c>.</returns>
public static Vector128<sbyte> _mm_blendv_epi8(Vector128<sbyte> left, Vector128<sbyte> right, Vector128<sbyte> mask)
    => Sse41.BlendVariable(left, right, mask);
// Resizes this bitmap into rtnImage in two passes through a float scratch buffer (tmpa).
// NOTE: the block is truncated in this view — the closing braces of the `fixed` block and of the method are not visible.
// Throws when either scale factor (source size / destination size) is greater than 1.
// Pass 1 (Parallel.For over source rows): for every destination x, gathers the four source pixels at
//   indices sx-1 .. sx+2 (indices outside [0, width) are replaced by sx), weights them with a cubic
//   kernel of the distance to px, and stores four floats per pixel into the scratch buffer.
// Pass 2 (Parallel.For over destination rows): gathers the matching values from four scratch rows
//   (row indices outside [0, height) are replaced by sy), applies the same weighting vertically,
//   clamps to [0, 255], converts to bytes and writes one uint per destination pixel.
// NOTE(review): Avx2.GatherVector128 and the SSE/SSSE3/SSE4.1 intrinsics are used without any IsSupported check in the visible code — confirm callers guard this.
public void ResizeBicubic(FastBitmap rtnImage) { float scaleX = (float)this.width / rtnImage.width; float scaleY = (float)this.height / rtnImage.height; if (scaleX > 1 || scaleY > 1) { throw new Exception("拡大のみ対応"); } float[] tmpa = new float[rtnImage.width * 4 * this.height]; fixed(float *tmpp = tmpa) { float *tmp = tmpp; var _00mask = Vector128.Create(0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255); var _01mask = Vector128.Create(4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255); var _10mask = Vector128.Create(8, 255, 255, 255, 9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255); var _11mask = Vector128.Create(12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255); var _vmask = Vector128.Create(0, 4, 8, 12, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255); var _1012 = Vector128.Create(-1, 0, 1, 2); var _0123i = Vector128.Create(0, 1, 2, 3); var _0000 = Vector128.Create(0, 0, 0, 0); var _0000f = Vector128.Create(0f, 0, 0, 0); var _255f = Vector128.Create(255f, 255, 255, 255); var _1111 = Vector128.Create(1, 1, 1, 1); var _1111f = Vector128.Create(1f, 1, 1, 1); var _4444f = Vector128.Create(4f, 4, 4, 4); var _4444 = Vector128.Create(4, 4, 4, 4); var _5555f = Vector128.Create(5f, 5, 5, 5); var _2222f = Vector128.Create(2f, 2, 2, 2); var _8888f = Vector128.Create(8f, 8, 8, 8); var _7f = Vector128.Create(0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff).AsSingle(); var _ff = Vector128.Create(-1, -1, -1, -1); var _stride = Vector128.Create(rtnImage.width * 4, rtnImage.width * 4, rtnImage.width * 4, rtnImage.width * 4); Parallel.For(0, this.height, (y) => { float py = (y * scaleY); float *tmpPos = tmp + y * rtnImage.width * 4; for (int x = 0; x < rtnImage.width; x++) { float px = (x * scaleX); int sx = (int)px; var _px = Vector128.CreateScalar(px); _px = Sse.Shuffle(_px, _px, 0); var _sx = Vector128.CreateScalar(sx); _sx = Sse2.Shuffle(_sx, 0); var _width = 
Vector128.CreateScalar(this.width); _width = Sse2.Shuffle(_width, 0); var _x2 = Sse2.Add(_sx, _1012); var _d = Sse.And(Sse.Subtract(_px, Sse2.ConvertToVector128Single(_x2)), _7f); var _d2 = Sse.Multiply(_d, _d); var _d3 = Sse.Multiply(_d2, _d); var w1 = Sse.Add(_1111f, Sse.Subtract(_d3, Sse.Multiply(_2222f, _d2))); var w2 = Sse.Subtract(Sse.Subtract(Sse.Add(_4444f, Sse.Multiply(_5555f, _d2)), Sse.Multiply(_d, _8888f)), _d3); var wb = Sse2.CompareGreaterThan(_d, _1111f); var _w = Sse41.BlendVariable(w1, w2, wb); var _xpb = Sse2.Or(Sse2.CompareLessThan(_x2, _0000), Sse41.MultiplyLow(Sse2.AndNot(Sse2.CompareLessThan(_x2, _width), _1111).AsInt32(), _ff)); var _xpp = Sse2.And(_sx, _xpb); var _xp = Sse41.BlendVariable(_x2, _xpp, _xpb); var p = Avx2.GatherVector128((uint *)(this._ptr + this._stride * y), _xp, 4).AsByte(); var _p0 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _00mask).AsInt32()); var _p1 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _01mask).AsInt32()); var _p2 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _10mask).AsInt32()); var _p3 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _11mask).AsInt32()); var _w0 = Sse.Shuffle(_w, _w, 0); var _w1 = Sse.Shuffle(_w, _w, 0b01010101); var _w2 = Sse.Shuffle(_w, _w, 0b10101010); var _w3 = Sse.Shuffle(_w, _w, 0b11111111); var rgbaf = Sse.Add(Sse.Add(Sse.Multiply(_p0, _w0), Sse.Multiply(_p1, _w1)), Sse.Add(Sse.Multiply(_p2, _w2), Sse.Multiply(_p3, _w3))); Sse2.Store(tmpPos + x * 4, rgbaf); } }); Parallel.For(0, rtnImage.height, (y) => { float py = (y * scaleY); int sy = (int)py; uint *store = stackalloc uint[4]; var _py = Vector128.CreateScalar(py); _py = Sse.Shuffle(_py, _py, 0); var _sy = Vector128.CreateScalar(sy); _sy = Sse2.Shuffle(_sy, 0); var _height = Vector128.CreateScalar(this.height); _height = Sse2.Shuffle(_height, 0); var _y2 = Sse2.Add(_sy, _1012); var _d = Sse.And(Sse.Subtract(_py, Sse2.ConvertToVector128Single(_y2)), _7f); var _d2 = Sse.Multiply(_d, _d); var _d3 = Sse.Multiply(_d2, _d); var 
w1 = Sse.Add(_1111f, Sse.Subtract(_d3, Sse.Multiply(_2222f, _d2))); var w2 = Sse.Subtract(Sse.Subtract(Sse.Add(_4444f, Sse.Multiply(_5555f, _d2)), Sse.Multiply(_d, _8888f)), _d3); var wb = Sse2.CompareGreaterThan(_d, _1111f); var _w = Sse41.BlendVariable(w1, w2, wb); var _ypb = Sse2.Or(Sse2.CompareLessThan(_y2, _0000), Sse41.MultiplyLow(Sse2.AndNot(Sse2.CompareLessThan(_y2, _height), _1111).AsInt32(), _ff)); var _ypp = Sse2.And(_sy, _ypb); var _yp = Sse41.BlendVariable(_y2, _ypp, _ypb); var _yps = Sse41.MultiplyLow(_yp, _stride); var _yp0 = Sse2.Add(Sse2.Shuffle(_yps, 0), _0123i); var _yp1 = Sse2.Add(Sse2.Shuffle(_yps, 0b01010101), _0123i); var _yp2 = Sse2.Add(Sse2.Shuffle(_yps, 0b10101010), _0123i); var _yp3 = Sse2.Add(Sse2.Shuffle(_yps, 0b11111111), _0123i); uint *rtn = (uint *)(rtnImage._ptr + rtnImage._stride * y); for (int x = 0; x < rtnImage.width; x++) { var _p0 = Avx2.GatherVector128((float *)(tmp), _yp0, 4); var _p1 = Avx2.GatherVector128((float *)(tmp), _yp1, 4); var _p2 = Avx2.GatherVector128((float *)(tmp), _yp2, 4); var _p3 = Avx2.GatherVector128((float *)(tmp), _yp3, 4); var _w0 = Sse.Shuffle(_w, _w, 0); var _w1 = Sse.Shuffle(_w, _w, 0b01010101); var _w2 = Sse.Shuffle(_w, _w, 0b10101010); var _w3 = Sse.Shuffle(_w, _w, 0b11111111); var rgbaf = Sse.Add(Sse.Add(Sse.Multiply(_p0, _w0), Sse.Multiply(_p1, _w1)), Sse.Add(Sse.Multiply(_p2, _w2), Sse.Multiply(_p3, _w3))); var _b0 = Sse.CompareLessThan(rgbaf, _0000f); rgbaf = Sse41.BlendVariable(rgbaf, _0000f, _b0); var _b1 = Sse.CompareGreaterThan(rgbaf, _255f); rgbaf = Sse41.BlendVariable(rgbaf, _255f, _b1); var rgbab = Sse2.ConvertToVector128Int32(rgbaf).AsByte(); var rgba = Ssse3.Shuffle(rgbab, _vmask).AsUInt32(); Sse2.Store(store, rgba); _yp0 = Sse2.Add(_yp0, _4444); _yp1 = Sse2.Add(_yp1, _4444); _yp2 = Sse2.Add(_yp2, _4444); _yp3 = Sse2.Add(_yp3, _4444); *rtn = *store; rtn++; } });
/// <summary>
/// Thin wrapper named after the C intrinsic; forwards to <c>Sse41.BlendVariable</c> for
/// single-precision vectors.
/// </summary>
/// <param name="left">First operand, passed through unchanged.</param>
/// <param name="right">Second operand, passed through unchanged.</param>
/// <param name="mask">Selection mask, passed through unchanged.</param>
/// <returns>The value returned by <c>Sse41.BlendVariable</c>.</returns>
public static Vector128<float> _mm_blendv_ps(Vector128<float> left, Vector128<float> right, Vector128<float> mask)
    => Sse41.BlendVariable(left, right, mask);