public void RunClsVarScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario));

    var result = Sse2.And(_clsVar1, _clsVar2);

    Unsafe.Write(_dataTable.outArrayPtr, result);
    ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
}
public void RunBasicScenario_LoadAligned()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_LoadAligned));

    var result = Sse2.And(
        Sse2.LoadAlignedVector128((UInt64*)(_dataTable.inArray1Ptr)),
        Sse2.LoadAlignedVector128((UInt64*)(_dataTable.inArray2Ptr))
    );

    Unsafe.Write(_dataTable.outArrayPtr, result);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
public void RunBasicScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_UnsafeRead));

    var result = Sse2.And(
        Unsafe.Read<Vector128<UInt64>>(_dataTable.inArray1Ptr),
        Unsafe.Read<Vector128<UInt64>>(_dataTable.inArray2Ptr)
    );

    Unsafe.Write(_dataTable.outArrayPtr, result);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
public void RunStructLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

    var test = TestStruct.Create();
    var result = Sse2.And(
        Sse2.LoadVector128((Double*)(&test._fld1)),
        Sse2.LoadVector128((Double*)(&test._fld2))
    );

    Unsafe.Write(_dataTable.outArrayPtr, result);
    ValidateResult(test._fld1, test._fld2, _dataTable.outArrayPtr);
}
private Hit[] RayTraceAVXFaster(Ray ray)
{
    Vector256<double> dir = (Vector256<double>)ray.Direction;
    Vector256<double> vert0 = (Vector256<double>)Vert0.Position;
    Vector256<double> edge0to1 = (Vector256<double>)Edge0to1;
    Vector256<double> edge0to2 = (Vector256<double>)Edge0to2;
    Vector256<double> offset = Avx.Subtract((Vector256<double>)ray.Origin, vert0);
    Vector256<double> side1 = SIMDHelpers.Cross(offset, edge0to1);
    Vector256<double> side2 = SIMDHelpers.Cross(dir, edge0to2);

    // Prepare all dot products
    Vector256<double> uvTemp = Avx.Multiply(offset, side2); // u
    Vector256<double> temp = Avx.Multiply(dir, side1);      // v
    Vector256<double> edge2Temp = Avx.Multiply(edge0to2, side1);
    Vector256<double> distTemp = Avx.Multiply(edge0to1, side2);
    uvTemp = Avx.HorizontalAdd(uvTemp, temp);
    edge2Temp = Avx.HorizontalAdd(edge2Temp, edge2Temp);
    distTemp = Avx.HorizontalAdd(distTemp, distTemp);

    // Complete all dot products for SSE ops
    Vector128<double> uvs = SIMDHelpers.Add2(uvTemp);
    Vector128<double> dist = SIMDHelpers.Add2(edge2Temp);
    Vector128<double> temp1 = SIMDHelpers.Add2(distTemp);
    Vector128<double> temp2;

    // vec2 constants we'll be using later
    Vector128<double> ones2 = SIMDHelpers.BroadcastScalar2(1D);
    Vector128<double> zeroes2 = new Vector128<double>();

    // Reciprocal of distance along edge0to1
    temp1 = Sse2.Divide(ones2, temp1);
    temp2 = Sse2.CompareOrdered(temp1, temp1);

    // Remove NaNs from the result, replaced with 0
    Vector128<double> distZeroed = Sse2.And(temp1, temp2);

    uvs = Sse2.Multiply(uvs, distZeroed);
    dist = Sse2.Multiply(dist, distZeroed);

    // Compare uvs < 0 and > 1, dist < 0; jump out if any of those conditions are met
    temp1 = Sse2.CompareLessThan(uvs, zeroes2);
    temp2 = Mirror ? uvs : Sse3.HorizontalAdd(uvs, uvs);
    temp2 = Sse2.CompareGreaterThan(temp2, ones2);
    temp1 = Sse2.Or(temp1, temp2);
    temp2 = Sse2.CompareLessThan(dist, zeroes2);
    temp1 = Sse2.Or(temp1, temp2);

    if (!Avx.TestZ(temp1, temp1))
    {
        return default;
    }

    // (the snippet ends here in the source; the code that builds the Hit[] result is not included)
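// SIMDHelpers.Cross, Add2, and BroadcastScalar2 are project helpers that the snippet does
// not show. A minimal sketch of what Add2 plausibly does under these assumptions: after
// Avx.HorizontalAdd each 128-bit half holds partial pair-sums, so adding the upper half
// onto the lower half completes the 4-wide double dot products used above.
static Vector128<double> Add2(Vector256<double> v)
{
    return Sse2.Add(v.GetLower(), v.GetUpper());
}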
public static Vector128<sbyte> CreateEscapingMask(
    Vector128<sbyte> sourceValue,
    Vector128<sbyte> bitMaskLookup,
    Vector128<sbyte> bitPosLookup,
    Vector128<sbyte> nibbleMaskSByte,
    Vector128<sbyte> nullMaskSByte)
{
    // To check if an input byte needs to be escaped or not, we use a bitmask-lookup.
    // Therefore we split the input byte into the low- and high-nibble, which will get
    // the row-/column-index in the bit-mask.
    // The bitmask-lookup looks like (here for example s_bitMaskLookupBasicLatin):
    //                          high-nibble
    // low-nibble  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
    //         0   1  1  0  0  0  0  1  0  1  1  1  1  1  1  1  1
    //         1   1  1  0  0  0  0  0  0  1  1  1  1  1  1  1  1
    //         2   1  1  1  0  0  0  0  0  1  1  1  1  1  1  1  1
    //         3   1  1  0  0  0  0  0  0  1  1  1  1  1  1  1  1
    //         4   1  1  0  0  0  0  0  0  1  1  1  1  1  1  1  1
    //         5   1  1  0  0  0  0  0  0  1  1  1  1  1  1  1  1
    //         6   1  1  1  0  0  0  0  0  1  1  1  1  1  1  1  1
    //         7   1  1  1  0  0  0  0  0  1  1  1  1  1  1  1  1
    //         8   1  1  0  0  0  0  0  0  1  1  1  1  1  1  1  1
    //         9   1  1  0  0  0  0  0  0  1  1  1  1  1  1  1  1
    //         A   1  1  0  0  0  0  0  0  1  1  1  1  1  1  1  1
    //         B   1  1  1  0  0  0  0  0  1  1  1  1  1  1  1  1
    //         C   1  1  0  1  0  1  0  0  1  1  1  1  1  1  1  1
    //         D   1  1  0  0  0  0  0  0  1  1  1  1  1  1  1  1
    //         E   1  1  0  1  0  0  0  0  1  1  1  1  1  1  1  1
    //         F   1  1  0  0  0  0  0  1  1  1  1  1  1  1  1  1
    //
    // where 1 denotes the need for escaping, while 0 means no escaping needed.
    // For high-nibbles in the range 8..F every input needs to be escaped, so we
    // can omit them in the bit-mask, thus only high-nibbles in the range 0..7 need
    // to be considered, hence the entries in the bit-mask can be of type byte.
    //
    // In the bitmask-lookup for each row (= low-nibble) a bit-mask for the
    // high-nibbles (= columns) is created.

    Debug.Assert(Ssse3.IsSupported);

    Vector128<sbyte> highNibbles = Sse2.And(Sse2.ShiftRightLogical(sourceValue.AsInt32(), 4).AsSByte(), nibbleMaskSByte);
    Vector128<sbyte> lowNibbles = Sse2.And(sourceValue, nibbleMaskSByte);

    Vector128<sbyte> bitMask = Ssse3.Shuffle(bitMaskLookup, lowNibbles);
    Vector128<sbyte> bitPositions = Ssse3.Shuffle(bitPosLookup, highNibbles);

    Vector128<sbyte> mask = Sse2.And(bitPositions, bitMask);

    mask = Sse2.CompareEqual(nullMaskSByte, Sse2.CompareEqual(nullMaskSByte, mask));
    return mask;
}
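// For reference, a scalar sketch of the same lookup (an illustration, not part of the
// original code): the low nibble picks a row of the table, the high nibble picks a bit
// within that row, and a set bit means "escape this byte".
static bool NeedsEscapingScalar(byte value, ReadOnlySpan<byte> bitMaskLookup)
{
    int lowNibble = value & 0xF;
    int highNibble = value >> 4;
    if (highNibble >= 8)
    {
        return true; // rows for high-nibbles 8..F are omitted; everything there is escaped
    }
    byte row = bitMaskLookup[lowNibble]; // one byte covers high-nibbles 0..7
    return (row & (1 << highNibble)) != 0;
}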
public static Vector128<byte> op_Multiply(Vector128<byte> left, Vector128<byte> right)
{
    Vector128<ushort> lowBits = Vector128.Create((ushort)0x00FF);
    var lowProduct = Sse2.And(lowBits, Sse2.MultiplyLow(left.AsUInt16(), right.AsUInt16())).AsByte();
    var highProduct = Sse2.ShiftLeftLogical(
        Sse2.MultiplyLow(
            Sse2.ShiftRightLogical(left.AsUInt16(), 8),
            Sse2.ShiftRightLogical(right.AsUInt16(), 8)
        ), 8).AsByte();
    return Sse2.Or(lowProduct, highProduct);
}
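// A quick scalar cross-check of the trick above (illustrative only). SSE2 has no 8-bit
// multiply, so even and odd bytes are multiplied in 16-bit lanes and recombined; each
// result byte is the low 8 bits of the per-byte product, i.e. wrap-around semantics.
var a = Vector128.Create((byte)7);
var b = Vector128.Create((byte)37);
var product = op_Multiply(a, b);
Debug.Assert(product.GetElement(0) == unchecked((byte)(7 * 37)));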
public void RunStructFldScenario_Load(SimpleBinaryOpTest__AndDouble testClass)
{
    fixed (Vector128<Double>* pFld1 = &_fld1)
    fixed (Vector128<Double>* pFld2 = &_fld2)
    {
        var result = Sse2.And(
            Sse2.LoadVector128((Double*)(pFld1)),
            Sse2.LoadVector128((Double*)(pFld2))
        );

        Unsafe.Write(testClass._dataTable.outArrayPtr, result);
        testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
    }
}
/*=========================================================================
** Returns a reference to the current instance ANDed with value.
**
** Exceptions: ArgumentNullException if value == null;
**             ArgumentException if value.Length != this.Length.
=========================================================================*/
public unsafe BitArray And(BitArray value)
{
    if (value == null)
    {
        throw new ArgumentNullException(nameof(value));
    }
    if (Length != value.Length)
    {
        throw new ArgumentException(SR.Arg_ArrayLengthsDiffer);
    }

    int count = m_array.Length;
    switch (count)
    {
        case 3: m_array[2] &= value.m_array[2]; goto case 2;
        case 2: m_array[1] &= value.m_array[1]; goto case 1;
        case 1: m_array[0] &= value.m_array[0]; goto Done;
        case 0: goto Done;
    }

    int i = 0;
    if (Sse2.IsSupported)
    {
        fixed (int* leftPtr = m_array)
        fixed (int* rightPtr = value.m_array)
        {
            for (; i < count - (Vector128<int>.Count - 1); i += Vector128<int>.Count)
            {
                Vector128<int> leftVec = Sse2.LoadVector128(leftPtr + i);
                Vector128<int> rightVec = Sse2.LoadVector128(rightPtr + i);
                Sse2.Store(leftPtr + i, Sse2.And(leftVec, rightVec));
            }
        }
    }
    for (; i < count; i++)
    {
        m_array[i] &= value.m_array[i];
    }

Done:
    _version++;
    return this;
}
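// Minimal usage sketch for the vectorized And above: both arrays must have equal length,
// and the receiver is mutated in place (the return value is the same instance).
var left = new BitArray(new[] { true, true, false, false });
var right = new BitArray(new[] { true, false, true, false });
left.And(right); // left is now { true, false, false, false }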
private static Vector128<int> Recursion(Vector128<int> a, Vector128<int> b, Vector128<int> c, Vector128<int> d)
{
    var y = Sse2.ShiftRightLogical(b, Sr1);
    var z = Sse2.ShiftRightLogical128BitLane(c, Sr2);
    var v = Sse2.ShiftLeftLogical(d, Sl1);
    z = Sse2.Xor(z, a);
    z = Sse2.Xor(z, v);
    var x = Sse2.ShiftLeftLogical128BitLane(a, Sl2);
    y = Sse2.And(y, Sse2ParamMask.si);
    z = Sse2.Xor(z, x);
    return Sse2.Xor(z, y);
}
private static unsafe void denoiseLineSse2(byte* pcurr, byte* pprev, byte* pnext, int cb)
{
    byte* ip = pcurr, pp = pprev, np = pnext;
    nuint cnt = 0, end = (nuint)cb - (nuint)Vector128<byte>.Count;

    var voffset = Vector128.Create((byte)0x80);
    var vthresh = Vector128.Create(denoiseThreshold);

LoopTop:
    do
    {
        var vcurr = Sse2.LoadVector128(ip + cnt);
        var vprev = Sse2.LoadVector128(pp + cnt);
        var vnext = Sse2.LoadVector128(np + cnt);

        var vdiffp = Sse2.Or(Sse2.SubtractSaturate(vcurr, vprev), Sse2.SubtractSaturate(vprev, vcurr));
        var vmaskp = Sse2.CompareEqual(Sse2.Max(vdiffp, vthresh), vthresh);

        var vdiffn = Sse2.Or(Sse2.SubtractSaturate(vcurr, vnext), Sse2.SubtractSaturate(vnext, vcurr));
        var vmaskn = Sse2.CompareEqual(Sse2.Max(vdiffn, vthresh), vthresh);

        var vavgp = Sse2.Average(vcurr, vprev);
        var vavgn = Sse2.Average(vcurr, vnext);

        var voutval = Sse2.Average(HWIntrinsics.BlendVariable(vavgn, vavgp, vmaskp), HWIntrinsics.BlendVariable(vavgp, vavgn, vmaskn));
        var voutmsk = Sse2.Or(vmaskp, vmaskn);
        voutval = Sse2.Average(voutval, HWIntrinsics.BlendVariable(voutval, Sse2.Average(vprev, vnext), Sse2.And(vmaskp, vmaskn)));

        var vcurrs = Sse2.Xor(vcurr, voffset).AsSByte();
        var vprevs = Sse2.Xor(vprev, voffset).AsSByte();
        var vnexts = Sse2.Xor(vnext, voffset).AsSByte();

        var vsurlt = Sse2.And(Sse2.CompareGreaterThan(vcurrs, vprevs), Sse2.CompareGreaterThan(vcurrs, vnexts));
        var vsurgt = Sse2.And(Sse2.CompareGreaterThan(vprevs, vcurrs), Sse2.CompareGreaterThan(vnexts, vcurrs));
        voutmsk = Sse2.And(voutmsk, Sse2.Or(vsurlt, vsurgt).AsByte());

        voutval = HWIntrinsics.BlendVariable(vcurr, voutval, voutmsk);
        Sse2.Store(ip + cnt, voutval);

        cnt += (nuint)Vector128<byte>.Count;
    } while (cnt <= end);

    if (cnt < end + (nuint)Vector128<byte>.Count)
    {
        cnt = end;
        goto LoopTop;
    }
}
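// HWIntrinsics.BlendVariable is a project helper not shown in the snippet. On pre-SSE4.1
// hardware the usual fallback is a bitwise select; a minimal sketch under that assumption:
static Vector128<byte> BlendVariableSse2(Vector128<byte> left, Vector128<byte> right, Vector128<byte> mask)
{
    // mask bytes must be all-ones (take right) or all-zeroes (take left)
    return Sse2.Or(Sse2.And(mask, right), Sse2.AndNot(mask, left));
}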
public void RunClsVarScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario_Load));

    fixed (Vector128<UInt32>* pClsVar1 = &_clsVar1)
    fixed (Vector128<UInt32>* pClsVar2 = &_clsVar2)
    {
        var result = Sse2.And(
            Sse2.LoadVector128((UInt32*)(pClsVar1)),
            Sse2.LoadVector128((UInt32*)(pClsVar2))
        );

        Unsafe.Write(_dataTable.outArrayPtr, result);
        ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
    }
}
static void do_recursion(ref FloatW128 r, ref FloatW128 a, ref FloatW128 b, ref FloatW128 u)
{
    Vector128<int> v, w, x, y, z;

    x = a.si;
    z = Sse2.ShiftLeftLogical(x.AsInt64(), DSFMT_SL1).AsInt32();
    y = Sse2.Shuffle(u.si, SSE2_SHUFF);
    z = Sse2.Xor(z, b.si);
    y = Sse2.Xor(y, z);

    v = Sse2.ShiftRightLogical(y.AsUInt64(), DSFMT_SR).AsInt32();
    w = Sse2.And(y, sse2_param_mask.i128);
    v = Sse2.Xor(v, x);
    v = Sse2.Xor(v, w);

    r.si = v;
    u.si = y;
}
public void RunClassFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

    fixed (Vector128<Double>* pFld1 = &_fld1)
    fixed (Vector128<Double>* pFld2 = &_fld2)
    {
        var result = Sse2.And(
            Sse2.LoadVector128((Double*)(pFld1)),
            Sse2.LoadVector128((Double*)(pFld2))
        );

        Unsafe.Write(_dataTable.outArrayPtr, result);
        ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
    }
}
private static unsafe int FillBuffer(ReadOnlySpan<char> input)
{
    int count = Math.Min(LineBuffer.Length, input.Length);
    int i = 0;
    fixed (char* buffer = LineBuffer, pInput = input)
    {
        if (Sse2.IsSupported && count >= Vector128<ushort>.Count)
        {
            Vector128<ushort> Space = Vector128.Create(SpaceCharUShort); // Space character
            do
            {
                var data = Sse2.LoadVector128((ushort*)pInput + i);
                var comp = Vector128<ushort>.Zero;
                comp = Sse2.CompareEqual(comp, data);
                if (Sse41.IsSupported)
                {
                    data = Sse41.BlendVariable(data, Space, comp);
                }
                else
                {
                    comp = Sse2.And(comp, Space);
                    data = Sse2.Or(data, comp); // Elements being replaced are already 0'ed
                }
                Sse2.Store((ushort*)buffer + i, data);
                i += Vector128<ushort>.Count;
            } while ((count - i) >= Vector128<ushort>.Count);
        }
        while (i < count)
        {
            char tmp = pInput[i];
            buffer[i] = tmp == 0 ? ' ' : tmp;
            i += 1;
        }
        return count;
    }
}
public void UseSse3_Unsafe(uint value)
{
    char[] buffer = _buffer;
    _ = buffer.Length; // elide future null checks
    // _ = buffer[7]; // elide future bounds checks

    uint tupleNumber = value;

    // These must be explicitly typed as ReadOnlySpan<byte>.
    // They then become non-allocating mappings to the data section of the assembly.
    // This uses the C# compiler's ability to refer to static data directly. For more information
    // see https://vcsjones.dev/2019/02/01/csharp-readonly-span-bytes-static
    ReadOnlySpan<byte> shuffleMaskData = new byte[16]
    {
        0xF, 0xF, 3, 0xF,
        0xF, 0xF, 2, 0xF,
        0xF, 0xF, 1, 0xF,
        0xF, 0xF, 0, 0xF
    };
    ReadOnlySpan<byte> asciiUpperCaseData = new byte[16]
    {
        (byte)'0', (byte)'1', (byte)'2', (byte)'3',
        (byte)'4', (byte)'5', (byte)'6', (byte)'7',
        (byte)'8', (byte)'9', (byte)'A', (byte)'B',
        (byte)'C', (byte)'D', (byte)'E', (byte)'F'
    };

    // Load from data section memory into Vector128 registers
    var shuffleMask = Unsafe.ReadUnaligned<Vector128<byte>>(ref MemoryMarshal.GetReference(shuffleMaskData));
    var asciiUpperCase = Unsafe.ReadUnaligned<Vector128<byte>>(ref MemoryMarshal.GetReference(asciiUpperCaseData));

    var lowNibbles = Ssse3.Shuffle(Vector128.CreateScalarUnsafe(tupleNumber).AsByte(), shuffleMask);
    var highNibbles = Sse2.ShiftRightLogical(Sse2.ShiftRightLogical128BitLane(lowNibbles, 2).AsInt32(), 4).AsByte();
    var indices = Sse2.And(Sse2.Or(lowNibbles, highNibbles), Vector128.Create((byte)0xF));
    // Lookup the hex values at the positions of the indices
    var hex = Ssse3.Shuffle(asciiUpperCase, indices);
    // The high bytes (0x00) of the chars have also been converted to ascii hex '0', so clear them out.
    hex = Sse2.And(hex, Vector128.Create((ushort)0xFF).AsByte());

    // This generates much more efficient asm than fixing the buffer and using
    // Sse2.Store((byte*)(p + i), chars.AsByte());
    Unsafe.WriteUnaligned(
        ref Unsafe.As<char, byte>(
            ref MemoryMarshal.GetArrayDataReference(buffer)),
        hex);
}
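// Usage sketch (my addition; HexFormatter is a hypothetical wrapper owning _buffer, which
// must hold at least 8 chars): the call writes the eight uppercase hex digits of the value,
// most significant nibble first.
var formatter = new HexFormatter();
formatter.UseSse3_Unsafe(0xDEADBEEF);
// formatter's buffer now starts with "DEADBEEF"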
private static unsafe bool IsNoneOpaque32Bytes(byte* src, int i)
{
    Vector128<byte> a0 = Sse2.LoadVector128(src + i).AsByte();
    Vector128<byte> a1 = Sse2.LoadVector128(src + i + 16).AsByte();
    Vector128<int> b0 = Sse2.And(a0, AlphaMask).AsInt32();
    Vector128<int> b1 = Sse2.And(a1, AlphaMask).AsInt32();
    Vector128<short> c = Sse2.PackSignedSaturate(b0, b1).AsInt16();
    Vector128<byte> d = Sse2.PackSignedSaturate(c, c).AsByte();
    Vector128<byte> bits = Sse2.CompareEqual(d, All0x80);
    int mask = Sse2.MoveMask(bits);
    return mask != 0xFFFF;
}
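// AlphaMask and All0x80 are defined elsewhere in the file; plausible definitions for a
// 32-bit-per-pixel alpha check (assumptions, not copied from the source). An alpha byte of
// 0xFF becomes 0xFF000000 as a signed lane, which the two signed-saturating packs turn
// into 0x80, so All0x80 is what a run of fully opaque pixels looks like after packing.
static readonly Vector128<byte> AlphaMask = Vector128.Create(0xff000000u).AsByte();
static readonly Vector128<byte> All0x80 = Vector128.Create((byte)0x80);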
public void RunClassLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

    var test = new SimpleBinaryOpTest__AndDouble();

    fixed (Vector128<Double>* pFld1 = &test._fld1)
    fixed (Vector128<Double>* pFld2 = &test._fld2)
    {
        var result = Sse2.And(
            Sse2.LoadVector128((Double*)(pFld1)),
            Sse2.LoadVector128((Double*)(pFld2))
        );

        Unsafe.Write(_dataTable.outArrayPtr, result);
        ValidateResult(test._fld1, test._fld2, _dataTable.outArrayPtr);
    }
}
public static Vector128<double> ConditionalSelectBitwise(Vector128<double> selector, Vector128<double> ifTrue, Vector128<double> ifFalse)
{
    // This implementation is based on the DirectX Math Library XMVector4NotEqual method
    // https://github.com/microsoft/DirectXMath/blob/master/Inc/DirectXMathVector.inl

    if (AdvSimd.IsSupported)
    {
        return AdvSimd.BitwiseSelect(selector, ifTrue, ifFalse);
    }
    else if (Sse2.IsSupported)
    {
        return Sse2.Or(Sse2.And(ifTrue, selector), Sse2.AndNot(selector, ifFalse));
    }
    else
    {
        // Redundant test so we won't prejit remainder of this method on platforms without AdvSimd.
        throw new PlatformNotSupportedException();
    }
}
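// The SSE2 branch is the classic bitwise select: each selector bit picks the corresponding
// bit from ifTrue (bit set) or ifFalse (bit clear). The same idea on a raw 64-bit lane
// (illustrative only):
static ulong SelectBits(ulong selector, ulong ifTrue, ulong ifFalse)
{
    return (ifTrue & selector) | (ifFalse & ~selector);
}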
private static unsafe void BCnDecodeTileAlpha(Span<byte> output, Span<byte> rPal, ulong rI)
{
    if (Avx2.IsSupported)
    {
        Span<Vector128<byte>> outputAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(output);

        Vector128<uint> shifts = Vector128.Create(0u, 3u, 6u, 9u);
        Vector128<uint> masks = Vector128.Create(7u);

        Vector128<byte> vClut;
        fixed (byte* pRPal = rPal)
        {
            vClut = Sse2.LoadScalarVector128((ulong*)pRPal).AsByte();
        }

        Vector128<uint> indices0 = Vector128.Create((uint)rI);
        Vector128<uint> indices1 = Vector128.Create((uint)(rI >> 24));

        Vector128<uint> indices00 = Avx2.ShiftRightLogicalVariable(indices0, shifts);
        Vector128<uint> indices10 = Avx2.ShiftRightLogicalVariable(indices1, shifts);
        Vector128<uint> indices01 = Sse2.ShiftRightLogical(indices00, 12);
        Vector128<uint> indices11 = Sse2.ShiftRightLogical(indices10, 12);

        indices00 = Sse2.And(indices00, masks);
        indices10 = Sse2.And(indices10, masks);
        indices01 = Sse2.And(indices01, masks);
        indices11 = Sse2.And(indices11, masks);

        Vector128<ushort> indicesW0 = Sse41.PackUnsignedSaturate(indices00.AsInt32(), indices01.AsInt32());
        Vector128<ushort> indicesW1 = Sse41.PackUnsignedSaturate(indices10.AsInt32(), indices11.AsInt32());

        Vector128<byte> indices = Sse2.PackUnsignedSaturate(indicesW0.AsInt16(), indicesW1.AsInt16());

        outputAsVector128[0] = Ssse3.Shuffle(vClut, indices);
    }
    else
    {
        for (int i = 0; i < BlockWidth * BlockHeight; i++, rI >>= 3)
        {
            output[i] = rPal[(int)(rI & 7)];
        }
    }
}
public static unsafe void ToUpperASCIIInPlace_SIMD(string text)
{
    // 0b_01111111_11011111_11111111_11011111
    const int upperIntOffset = 2145386463;
    const int upperCharOffset = 95;

    Vector128<int> vresult = Vector128<int>.Zero;
    Vector128<int> add = Vector128.Create(upperIntOffset);
    var len = text.Length;
    fixed (char* pSource = text)
    {
        int i = 0;
        int lastBlockIndex = len - (len % 8);
        while (i < lastBlockIndex)
        {
            int* c = (int*)(pSource + i);
            vresult = Sse2.LoadVector128(c);
            vresult = Sse2.And(vresult, add);
            Sse2.Store(c, vresult);
            i += 8;
        }
        while (i < len)
        {
            char* c = pSource + i;
            *c = (char)(*c & upperCharOffset);
            i += 1;
        }
    }
}
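// Usage sketch (illustrative): the routine mutates the string's backing memory in place
// via unsafe pointers, and the masking is only correct for ASCII letters; it clears the
// lowercase bit unconditionally instead of testing the character class.
var text = new string("helloworld".ToCharArray()); // fresh instance; string literals must not be mutated
ToUpperASCIIInPlace_SIMD(text);
// text is now "HELLOWORLD"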
public static unsafe int CountEvenSIMD(int[] numbers)
{
    int counter = 0;
    int len = numbers.Length;
    fixed (int* num = numbers)
    {
        Vector128<int> vresult = Vector128<int>.Zero;
        Vector128<int> ones = Vector128.Create(1);
        int i = 0;
        int lastBlockIndex = len - (len % 4);
        while (i < lastBlockIndex)
        {
            var vec = Sse2.LoadVector128(num + i);
            var odds = Sse2.And(vec, ones);
            vresult = Sse2.Add(vresult, odds);
            i += 4;
        }

        vresult = Ssse3.HorizontalAdd(vresult, vresult);
        vresult = Ssse3.HorizontalAdd(vresult, vresult);
        counter = vresult.ToScalar();

        while (i < len)
        {
            var odd = numbers[i] & 1;
            counter += odd;
            i += 1;
        }
    }
    return numbers.Length - counter;
}
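// Usage sketch: AND-ing each lane with 1 yields 1 for odd values and 0 for even ones, the
// two HorizontalAdds sum the four lanes, and evens = length - odds.
int[] data = { 1, 2, 3, 4, 5, 6, 7, 8, 9 };
int evens = CountEvenSIMD(data); // 4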
public static unsafe void Run(World world)
{
    if (!Sse.IsSupported || !Sse2.IsSupported)
    {
        throw new Exception("Your processor must support SSE and SSE2 to run this.");
    }

    var charCount = world.AllCharacters.Count;
    var chars = stackalloc CharData[charCount];
    for (var i = 0; i < charCount; i++)
    {
        var characterActor = world.AllCharacters[i];
        var allegianceComponent = characterActor.FindComponent<AllegianceComponent>();
        chars[i] = new CharData
        {
            X = characterActor.Position.X,
            Y = characterActor.Position.Y,
            Z = characterActor.Position.Z,
            Allegiance = allegianceComponent.Allegiance
        };
    }

    var doorData = world.DoorData;
    var doorCount = doorData.Count;
    for (var d = 0; d < doorCount; d += 4)
    {
        var doorX = Sse.LoadAlignedVector128(doorData.X.AlignedPointer + d);
        var doorY = Sse.LoadAlignedVector128(doorData.Y.AlignedPointer + d);
        var doorZ = Sse.LoadAlignedVector128(doorData.Z.AlignedPointer + d);
        var doorR2 = Sse.LoadAlignedVector128(doorData.RadiusSquared.AlignedPointer + d);
        var doorA = Sse2.LoadAlignedVector128(doorData.Allegiance.AlignedPointer + d);

        var state = Vector128<uint>.Zero;
        for (var cc = 0; cc < charCount; cc++)
        {
            ref var c = ref chars[cc];
            var charX = Vector128.Create(c.X);
            var charY = Vector128.Create(c.Y);
            var charZ = Vector128.Create(c.Z);
            var charA = Vector128.Create(c.Allegiance);

            var ddx = Sse.Subtract(doorX, charX);
            var ddy = Sse.Subtract(doorY, charY);
            var ddz = Sse.Subtract(doorZ, charZ);

            var dtx = Sse.Multiply(ddx, ddx);
            var dty = Sse.Multiply(ddy, ddy);
            var dtz = Sse.Multiply(ddz, ddz);

            var dst2 = Sse.Add(Sse.Add(dtx, dty), dtz);

            var rmask = Sse.CompareLessThanOrEqual(dst2, doorR2);
            var amask = Sse2.CompareEqual(charA, doorA);
            var mask = Sse2.And(rmask.AsUInt32(), amask);
            state = Sse2.Or(mask, state);
        }

        Sse2.StoreAligned(doorData.ShouldBeOpen.AlignedPointer + d, state);
    }
}
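// CharData is not shown in the snippet; a minimal layout consistent with the stackalloc
// and the field accesses above would be (assumption):
struct CharData
{
    public float X;
    public float Y;
    public float Z;
    public uint Allegiance;
}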
public static unsafe Vector128<byte> End(ref State state, Span<byte> store128)
{
    long Len = state.TotalLengthInBytes;
    Vector128<byte> xmm0 = state.xmm0;
    Vector128<byte> xmm1 = state.xmm1;
    Vector128<byte> xmm2 = state.xmm2;
    Vector128<byte> xmm3 = state.xmm3;
    Vector128<byte> xmm4 = state.xmm4;
    Vector128<byte> xmm5 = state.xmm5;
    Vector128<byte> xmm6 = state.xmm6;
    Vector128<byte> xmm7 = state.xmm7;
    Vector128<byte> xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;

    fixed (byte* rax = state.Buffer)
    {
        xmm9 = Vector128<byte>.Zero;
        xmm11 = Vector128<byte>.Zero;

        byte* Last = (byte*)rax + (Len & 0xf0);
        long Len8 = (Len & 0xf);
        if (Len8 > 0)
        {
            fixed (byte* MeowMaskLen = s_meowMaskLen)
            {
                xmm8 = Sse2.LoadVector128(&MeowMaskLen[0x10 - Len8]);
            }
            xmm9 = Sse2.LoadVector128(Last);
            xmm9 = Sse2.And(xmm9, xmm8);
        }

        if ((Len & 0x10) != 0)
        {
            xmm11 = xmm9;
            xmm9 = Sse2.LoadVector128(Last - 0x10);
        }

        xmm8 = xmm9;
        xmm10 = xmm9;
        xmm8 = Ssse3.AlignRight(xmm8, xmm11, 15);
        xmm10 = Ssse3.AlignRight(xmm10, xmm11, 1);

        xmm12 = Vector128<byte>.Zero;
        xmm13 = Vector128<byte>.Zero;
        xmm14 = Vector128<byte>.Zero;
        xmm15 = Vector128.Create((ulong)Len, 0).AsByte();
        xmm12 = Ssse3.AlignRight(xmm12, xmm15, 15);
        xmm14 = Ssse3.AlignRight(xmm14, xmm15, 1);

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostBlocks", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
        MEOW_DUMP_STATE("Residuals", xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
#endif

        // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty
        MEOW_MIX_REG(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, xmm8, xmm9, xmm10, xmm11);

        // NOTE(casey): Append the length, to avoid problems with our 32-byte padding
        MEOW_MIX_REG(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, xmm12, xmm13, xmm14, xmm15);

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostAppend", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

        //
        // NOTE(casey): Hash all full 32-byte blocks
        //
        long LaneCount = (Len >> 5) & 0x7;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x00); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x20); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x40); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0x60); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0x80); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xa0); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0xc0); --LaneCount;

        //
        // NOTE(casey): Mix the eight lanes down to one 128-bit hash
        //
    MixDown:
#if MEOW_DUMP
        MEOW_DUMP_STATE("PostLanes", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif
        MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
        MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
        MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
        MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);
        MEOW_SHUFFLE(ref xmm4, ref xmm5, xmm6, ref xmm0, ref xmm1, xmm2);
        MEOW_SHUFFLE(ref xmm5, ref xmm6, xmm7, ref xmm1, ref xmm2, xmm3);
        MEOW_SHUFFLE(ref xmm6, ref xmm7, xmm0, ref xmm2, ref xmm3, xmm4);
        MEOW_SHUFFLE(ref xmm7, ref xmm0, xmm1, ref xmm3, ref xmm4, xmm5);
        MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
        MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
        MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
        MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostMix", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

        if (store128 != null)
        {
            fixed (byte* store128Ptr = store128)
            {
                Sse2.Store(store128Ptr + 0x00, xmm0);
                Sse2.Store(store128Ptr + 0x10, xmm1);
                Sse2.Store(store128Ptr + 0x20, xmm2);
                Sse2.Store(store128Ptr + 0x30, xmm3);
                Sse2.Store(store128Ptr + 0x40, xmm4);
                Sse2.Store(store128Ptr + 0x50, xmm5);
                Sse2.Store(store128Ptr + 0x60, xmm6);
                Sse2.Store(store128Ptr + 0x70, xmm7);
            }
        }

        xmm0 = AddQ(xmm0, xmm2);
        xmm1 = AddQ(xmm1, xmm3);
        xmm4 = AddQ(xmm4, xmm6);
        xmm5 = AddQ(xmm5, xmm7);
        xmm0 = Sse2.Xor(xmm0, xmm1);
        xmm4 = Sse2.Xor(xmm4, xmm5);
        xmm0 = AddQ(xmm0, xmm4);

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostFold", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

        return xmm0;
    }
}
//
// NOTE(casey): Single block version
//
public static unsafe Vector128<byte> Hash(ReadOnlySpan<byte> Seed128Init, ReadOnlySpan<byte> SourceInit)
{
    Vector128<byte> xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;       // NOTE(casey): xmm0-xmm7 are the hash accumulation lanes
    Vector128<byte> xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; // NOTE(casey): xmm8-xmm15 hold values to be appended (residual, length)

    int Len = SourceInit.Length;

    fixed (byte* sourceInitPtr = SourceInit)
    fixed (byte* seedInitPtr = Seed128Init)
    {
        byte* rax = sourceInitPtr;
        byte* rcx = seedInitPtr;

        //
        // NOTE(casey): Seed the eight hash registers
        //
        xmm0 = Sse2.LoadVector128(rcx + 0x00);
        xmm1 = Sse2.LoadVector128(rcx + 0x10);
        xmm2 = Sse2.LoadVector128(rcx + 0x20);
        xmm3 = Sse2.LoadVector128(rcx + 0x30);
        xmm4 = Sse2.LoadVector128(rcx + 0x40);
        xmm5 = Sse2.LoadVector128(rcx + 0x50);
        xmm6 = Sse2.LoadVector128(rcx + 0x60);
        xmm7 = Sse2.LoadVector128(rcx + 0x70);

        // MEOW_DUMP_STATE("Seed", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0);

        //
        // NOTE(casey): Hash all full 256-byte blocks
        //
        int BlockCount = (SourceInit.Length >> 8);
        if (BlockCount > MEOW_PREFETCH_LIMIT)
        {
            // NOTE(casey): For large input, modern Intel x64's can't hit full speed without prefetching, so we use this loop
            while (BlockCount-- > 0)
            {
                Sse.Prefetch0(rax + MEOW_PREFETCH + 0x00);
                Sse.Prefetch0(rax + MEOW_PREFETCH + 0x40);
                Sse.Prefetch0(rax + MEOW_PREFETCH + 0x80);
                Sse.Prefetch0(rax + MEOW_PREFETCH + 0xc0);

                MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0x00);
                MEOW_MIX(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, rax + 0x20);
                MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x40);
                MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x60);
                MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x80);
                MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0xa0);
                MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0xc0);
                MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xe0);

                rax += 0x100;
            }
        }
        else
        {
            // NOTE(casey): For small input, modern Intel x64's can't hit full speed _with_ prefetching (because of port pressure), so we use this loop.
            while (BlockCount-- > 0)
            {
                MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0x00);
                MEOW_MIX(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, rax + 0x20);
                MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x40);
                MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x60);
                MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x80);
                MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0xa0);
                MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0xc0);
                MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xe0);

                rax += 0x100;
            }
        }

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostBlocks", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

        //
        // NOTE(casey): Load any less-than-32-byte residual
        //
        xmm9 = Vector128<byte>.Zero;
        xmm11 = Vector128<byte>.Zero;

        //
        // TODO(casey): I need to put more thought into how the end-of-buffer stuff is actually working out here,
        // because I _think_ it may be possible to remove the first branch (on Len8) and let the mask zero out the
        // result, but it would take a little thought to make sure it couldn't read off the end of the buffer due
        // to the & 0xf on the align computation.
        //

        // NOTE(casey): First, we have to load the part that is _not_ 16-byte aligned
        byte* Last = (byte*)sourceInitPtr + (Len & ~0xf);
        int Len8 = (Len & 0xf);
        if (Len8 > 0)
        {
            // NOTE(casey): Load the mask early
            fixed (byte* MeowMaskLen = s_meowMaskLen)
            {
                xmm8 = Sse2.LoadVector128(&MeowMaskLen[0x10 - Len8]);
            }

            byte* LastOk = (byte*)((((ulong)(((byte*)sourceInitPtr) + Len - 1)) | (MEOW_PAGESIZE - 1)) - 16);
            int Align = (Last > LastOk) ? ((int)(ulong)Last) & 0xf : 0;
            fixed (byte* MeowShiftAdjust = s_meowShiftAdjust)
            {
                xmm10 = Sse2.LoadVector128(&MeowShiftAdjust[Align]);
            }
            xmm9 = Sse2.LoadVector128(Last - Align);
            xmm9 = Ssse3.Shuffle(xmm9, xmm10);

            // NOTE(jeffr): and off the extra bytes
            xmm9 = Sse2.And(xmm9, xmm8);
        }

        // NOTE(casey): Next, we have to load the part that _is_ 16-byte aligned
        if ((Len & 0x10) != 0)
        {
            xmm11 = xmm9;
            xmm9 = Sse2.LoadVector128(Last - 0x10);
        }

        //
        // NOTE(casey): Construct the residual and length ingests
        //
        xmm8 = xmm9;
        xmm10 = xmm9;
        xmm8 = Ssse3.AlignRight(xmm8, xmm11, 15);
        xmm10 = Ssse3.AlignRight(xmm10, xmm11, 1);

        // NOTE(casey): We have room for a 128-bit nonce and a 64-bit nonce here, but
        // the decision was made to leave them zero'd so as not to confuse people
        // about how to use them or what security implications they had.
        xmm12 = Vector128<byte>.Zero;
        xmm13 = Vector128<byte>.Zero;
        xmm14 = Vector128<byte>.Zero;
        xmm15 = Vector128.Create((ulong)Len, 0).AsByte();
        xmm12 = Ssse3.AlignRight(xmm12, xmm15, 15);
        xmm14 = Ssse3.AlignRight(xmm14, xmm15, 1);

#if MEOW_DUMP
        MEOW_DUMP_STATE("Residuals", xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
#endif

        // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty
        MEOW_MIX_REG(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, xmm8, xmm9, xmm10, xmm11);

        // NOTE(casey): Append the length, to avoid problems with our 32-byte padding
        MEOW_MIX_REG(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, xmm12, xmm13, xmm14, xmm15);

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostAppend", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

        //
        // NOTE(casey): Hash all full 32-byte blocks
        //
        int LaneCount = (Len >> 5) & 0x7;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x00); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x20); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x40); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0x60); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0x80); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xa0); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0xc0); --LaneCount;

        //
        // NOTE(casey): Mix the eight lanes down to one 128-bit hash
        //
    MixDown:
#if MEOW_DUMP
        MEOW_DUMP_STATE("PostLanes", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif
        MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
        MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
        MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
        MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);
        MEOW_SHUFFLE(ref xmm4, ref xmm5, xmm6, ref xmm0, ref xmm1, xmm2);
        MEOW_SHUFFLE(ref xmm5, ref xmm6, xmm7, ref xmm1, ref xmm2, xmm3);
        MEOW_SHUFFLE(ref xmm6, ref xmm7, xmm0, ref xmm2, ref xmm3, xmm4);
        MEOW_SHUFFLE(ref xmm7, ref xmm0, xmm1, ref xmm3, ref xmm4, xmm5);
        MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
        MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
        MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
        MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostMix", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

        xmm0 = AddQ(xmm0, xmm2);
        xmm1 = AddQ(xmm1, xmm3);
        xmm4 = AddQ(xmm4, xmm6);
        xmm5 = AddQ(xmm5, xmm7);
        xmm0 = Sse2.Xor(xmm0, xmm1);
        xmm4 = Sse2.Xor(xmm4, xmm5);
        xmm0 = AddQ(xmm0, xmm4);

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostFold", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

        return xmm0;
    }
}
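// Usage sketch for the single-block version (illustrative): Meow expects a 128-byte seed,
// and s_meowDefaultSeed here stands in for whatever seed table this port defines (an
// assumption on my part).
byte[] data = Encoding.UTF8.GetBytes("The quick brown fox");
Vector128<byte> hash = Hash(s_meowDefaultSeed, data);
ulong hash64 = hash.AsUInt64().ToScalar(); // take 64 bits of the 128-bit hash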
public static Vector128<short> Divide(this Vector128<short> dividend, Vector128<short> divisor)
{
    // Based on https://stackoverflow.com/a/51458507/347870

    // Convert to two 32-bit integers
    Vector128<int> a_hi_epi32 = Sse2.ShiftRightArithmetic(dividend.AsInt32(), 16);
    Vector128<int> a_lo_epi32_shift = Sse2.ShiftLeftLogical(dividend.AsInt32(), 16);
    Vector128<int> a_lo_epi32 = Sse2.ShiftRightArithmetic(a_lo_epi32_shift, 16);
    Vector128<int> b_hi_epi32 = Sse2.ShiftRightArithmetic(divisor.AsInt32(), 16);
    Vector128<int> b_lo_epi32_shift = Sse2.ShiftLeftLogical(divisor.AsInt32(), 16);
    Vector128<int> b_lo_epi32 = Sse2.ShiftRightArithmetic(b_lo_epi32_shift, 16);

    // Convert to 32-bit floats
    Vector128<float> a_hi = Sse2.ConvertToVector128Single(a_hi_epi32);
    Vector128<float> a_lo = Sse2.ConvertToVector128Single(a_lo_epi32);
    Vector128<float> b_hi = Sse2.ConvertToVector128Single(b_hi_epi32);
    Vector128<float> b_lo = Sse2.ConvertToVector128Single(b_lo_epi32);

    // Calculate the reciprocal
    Vector128<float> b_hi_rcp = Sse.Reciprocal(b_hi);
    Vector128<float> b_lo_rcp = Sse.Reciprocal(b_lo);

    // Calculate the inverse
    Vector128<float> b_hi_inv_1;
    Vector128<float> b_lo_inv_1;
    Vector128<float> two = Vector128.Create(2.00000051757f);
    if (Fma.IsSupported)
    {
        b_hi_inv_1 = Fma.MultiplyAddNegated(b_hi_rcp, b_hi, two);
        b_lo_inv_1 = Fma.MultiplyAddNegated(b_lo_rcp, b_lo, two);
    }
    else
    {
        Vector128<float> b_mul_hi = Sse.Multiply(b_hi_rcp, b_hi);
        Vector128<float> b_mul_lo = Sse.Multiply(b_lo_rcp, b_lo);
        b_hi_inv_1 = Sse.Subtract(two, b_mul_hi);
        b_lo_inv_1 = Sse.Subtract(two, b_mul_lo);
    }

    // Compensate for the loss
    Vector128<float> b_hi_rcp_1 = Sse.Multiply(b_hi_rcp, b_hi_inv_1);
    Vector128<float> b_lo_rcp_1 = Sse.Multiply(b_lo_rcp, b_lo_inv_1);

    // Perform the division by multiplication
    Vector128<float> hi = Sse.Multiply(a_hi, b_hi_rcp_1);
    Vector128<float> lo = Sse.Multiply(a_lo, b_lo_rcp_1);

    // Convert back to integers
    Vector128<int> hi_epi32 = Sse2.ConvertToVector128Int32WithTruncation(hi);
    Vector128<int> lo_epi32 = Sse2.ConvertToVector128Int32WithTruncation(lo);

    // Zero-out the unnecessary parts
    Vector128<int> hi_epi32_shift = Sse2.ShiftLeftLogical(hi_epi32, 16);

    // Blend the bits, and return
    if (Sse41.IsSupported)
    {
        return Sse41.Blend(lo_epi32.AsInt16(), hi_epi32_shift.AsInt16(), 0xAA);
    }
    else
    {
        // Keep only the low 16 bits of each 32-bit lane before merging
        // (the original snippet built an all-ones mask here, which was a no-op).
        Vector128<int> lo_epi32_mask = Sse2.And(lo_epi32, Vector128.Create(0x0000FFFF));
        return Sse2.Or(hi_epi32_shift, lo_epi32_mask).AsInt16();
    }
}
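// A quick cross-check of the reciprocal-based division (illustrative): each 16-bit lane of
// the result should match C#'s truncating integer division for the same lanes.
var dividend = Vector128.Create((short)1000, -1000, 77, -77, 32000, -32000, 3, -3);
var divisor = Vector128.Create((short)3, 3, -7, -7, 13, 13, 2, 2);
var quotient = dividend.Divide(divisor);
Debug.Assert(quotient.GetElement(0) == 1000 / 3); // 333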
public static unsafe void ComputeDouble(
    uint[,] iterations,
    int startScanline,
    int increment,
    double offsetX,
    double offsetY,
    double zoom,
    uint maxIterations,
    ref bool cancel)
{
    const int stride = 2;

    int height = iterations.GetLength(0);
    int width = iterations.GetLength(1);
    var maxIter = Vector128.Create((double)maxIterations);
    var limit = Vector128.Create(4.0);
    var one = Vector128.Create(1.0);
    var two = Vector128.Create(2.0);

    var results = stackalloc double[stride];

    for (int i = startScanline; i < height && !cancel; i += increment)
    {
        for (int j = 0; j < width && !cancel; j += stride)
        {
            var c0 = Impl.GetPointCoordinate(j + 0, i, width, height, offsetX, offsetY, zoom);
            var c1 = Impl.GetPointCoordinate(j + 1, i, width, height, offsetX, offsetY, zoom);

            var cr = Vector128.Create(c0.X, c1.X);
            var ci = Vector128.Create(c0.Y, c1.Y);
            var zr = cr;
            var zi = ci;
            var it = Vector128.Create(0.0);

            for (;;)
            {
                var zr2 = Sse2.Multiply(zr, zr);
                var zi2 = Sse2.Multiply(zi, zi);
                var squaredMagnitude = Sse2.Add(zr2, zi2);

                var cond = Sse2.And(
                    Sse2.CompareLessThanOrEqual(squaredMagnitude, limit),
                    Sse2.CompareLessThanOrEqual(it, maxIter));

                if (Sse2.MoveMask(cond) == 0)
                {
                    Sse2.Store(results, it);
                    if (j + 0 < width)
                    {
                        iterations[i, j + 0] = (uint)results[0] % maxIterations;
                    }
                    if (j + 1 < width)
                    {
                        iterations[i, j + 1] = (uint)results[1] % maxIterations;
                    }
                    break;
                }

                zi = Sse2.Add(Sse2.Multiply(two, Sse2.Multiply(zr, zi)), ci);
                zr = Sse2.Add(Sse2.Subtract(zr2, zi2), cr);
                it = Sse2.Add(it, Sse2.And(one, cond));
            }
        }
    }
}
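// The per-lane iteration counter trick above: a "true" compare lane is all-ones, which
// reinterpreted as a double is a NaN, so it cannot be added directly. AND-ing the mask
// with 1.0 instead yields 1.0 for active lanes and 0.0 for finished ones. One lane of that
// step, written out as a scalar (illustrative):
static double MaskedIncrementStep(double it, bool stillIterating)
{
    return it + (stillIterating ? 1.0 : 0.0); // Sse2.And(one, cond) does this without a branch
}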
private unsafe void pruneTree(OctreeNode* ptree, ushort* pfree)
{
#if HWINTRINSICS
    var sumsMask = Vector128.Create(0xffffffffu, 0xffffffffu, 0xffffffffu, 0x1fffffffu);
    var vzero = Vector128<uint>.Zero;
#endif

    ushort* pnext = pfree;
    uint level = --leafLevel;

    for (nuint i = 8; i < maxHistogramSize; i++)
    {
        var node = ptree + i;
        uint nl = OctreeNode.GetLevel(node);

        if (nl == level)
        {
            ushort* children = (ushort*)node;
            uint* sums = (uint*)(children + 8);

#if HWINTRINSICS
            if (Sse2.IsSupported)
            {
                var vsums = Sse2.LoadVector128(sums);

                for (nuint j = 0; j < 8; j++)
                {
                    nuint child = children[j];
                    if (child != 0)
                    {
                        var cnode = ptree + child;
                        uint* csums = (uint*)((ushort*)cnode + 8);

                        var vcsum = Sse2.And(sumsMask, Sse2.LoadVector128(csums));
                        vsums = Sse2.Add(vsums, vcsum);

                        Sse2.Store((uint*)cnode, vzero);
                        Sse2.Store(csums, vzero);

                        *pnext++ = (ushort)child;
                    }
                }

                Sse2.Store((uint*)children, vzero);
                Sse2.Store(sums, vsums);
            }
            else
#endif
            {
                for (nuint j = 0; j < 8; j++)
                {
                    nuint child = children[j];
                    if (child != 0)
                    {
                        var cnode = ptree + child;
                        uint* csums = (uint*)((ushort*)cnode + 8);

                        sums[0] += csums[0];
                        sums[1] += csums[1];
                        sums[2] += csums[2];
                        sums[3] += csums[3] & 0x1fffffff;

                        Unsafe.InitBlockUnaligned(cnode, 0, (uint)Unsafe.SizeOf<OctreeNode>());

                        *pnext++ = (ushort)child;
                    }
                }

                Unsafe.InitBlockUnaligned(children, 0, sizeof(ushort) * 8);
            }
        }
    }

    *pnext = 0;
}
private static unsafe void WriteNv12(ResourceManager rm, Surface input, ref OutputSurfaceConfig config, ref PlaneOffsets offsets)
{
    int gobBlocksInY = 1 << config.OutBlkHeight;

    bool outLinear = config.OutBlkKind == 0;

    int width = Math.Min(config.OutLumaWidth + 1, input.Width);
    int height = Math.Min(config.OutLumaHeight + 1, input.Height);
    int yStride = GetPitch(config.OutLumaWidth + 1, 1);

    int dstYIndex = rm.BufferPool.Rent((config.OutLumaHeight + 1) * yStride, out Span<byte> dstY);

    if (Sse41.IsSupported)
    {
        Vector128<ushort> mask = Vector128.Create(0xffffUL).AsUInt16();

        int widthTrunc = width & ~0xf;
        int strideGap = yStride - width;

        fixed (Pixel* srcPtr = input.Data)
        {
            Pixel* ip = srcPtr;

            fixed (byte* dstPtr = dstY)
            {
                byte* op = dstPtr;

                for (int y = 0; y < height; y++, ip += input.Width)
                {
                    int x = 0;

                    for (; x < widthTrunc; x += 16)
                    {
                        byte* baseOffset = (byte*)(ip + (ulong)(uint)x);

                        Vector128<ushort> pixelp1 = Sse2.LoadVector128((ushort*)baseOffset);
                        Vector128<ushort> pixelp2 = Sse2.LoadVector128((ushort*)(baseOffset + 0x10));
                        Vector128<ushort> pixelp3 = Sse2.LoadVector128((ushort*)(baseOffset + 0x20));
                        Vector128<ushort> pixelp4 = Sse2.LoadVector128((ushort*)(baseOffset + 0x30));
                        Vector128<ushort> pixelp5 = Sse2.LoadVector128((ushort*)(baseOffset + 0x40));
                        Vector128<ushort> pixelp6 = Sse2.LoadVector128((ushort*)(baseOffset + 0x50));
                        Vector128<ushort> pixelp7 = Sse2.LoadVector128((ushort*)(baseOffset + 0x60));
                        Vector128<ushort> pixelp8 = Sse2.LoadVector128((ushort*)(baseOffset + 0x70));

                        pixelp1 = Sse2.And(pixelp1, mask);
                        pixelp2 = Sse2.And(pixelp2, mask);
                        pixelp3 = Sse2.And(pixelp3, mask);
                        pixelp4 = Sse2.And(pixelp4, mask);
                        pixelp5 = Sse2.And(pixelp5, mask);
                        pixelp6 = Sse2.And(pixelp6, mask);
                        pixelp7 = Sse2.And(pixelp7, mask);
                        pixelp8 = Sse2.And(pixelp8, mask);

                        Vector128<ushort> pixelq1 = Sse41.PackUnsignedSaturate(pixelp1.AsInt32(), pixelp2.AsInt32());
                        Vector128<ushort> pixelq2 = Sse41.PackUnsignedSaturate(pixelp3.AsInt32(), pixelp4.AsInt32());
                        Vector128<ushort> pixelq3 = Sse41.PackUnsignedSaturate(pixelp5.AsInt32(), pixelp6.AsInt32());
                        Vector128<ushort> pixelq4 = Sse41.PackUnsignedSaturate(pixelp7.AsInt32(), pixelp8.AsInt32());

                        pixelq1 = Sse41.PackUnsignedSaturate(pixelq1.AsInt32(), pixelq2.AsInt32());
                        pixelq2 = Sse41.PackUnsignedSaturate(pixelq3.AsInt32(), pixelq4.AsInt32());

                        pixelq1 = Sse2.ShiftRightLogical(pixelq1, 2);
                        pixelq2 = Sse2.ShiftRightLogical(pixelq2, 2);

                        Vector128<byte> pixel = Sse2.PackUnsignedSaturate(pixelq1.AsInt16(), pixelq2.AsInt16());

                        Sse2.Store(op, pixel);

                        op += 0x10;
                    }

                    for (; x < width; x++)
                    {
                        Pixel* px = ip + (uint)x;

                        *op++ = Downsample(px->R);
                    }

                    op += strideGap;
                }
            }
        }
    }
    else
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                dstY[y * yStride + x] = Downsample(input.GetR(x, y));
            }
        }
    }

    WriteBuffer(
        rm,
        dstY,
        offsets.LumaOffset,
        outLinear,
        config.OutLumaWidth + 1,
        config.OutLumaHeight + 1,
        1,
        gobBlocksInY);

    rm.BufferPool.Return(dstYIndex);

    int uvWidth = Math.Min(config.OutChromaWidth + 1, (width + 1) >> 1);
    int uvHeight = Math.Min(config.OutChromaHeight + 1, (height + 1) >> 1);
    int uvStride = GetPitch(config.OutChromaWidth + 1, 2);

    int dstUvIndex = rm.BufferPool.Rent((config.OutChromaHeight + 1) * uvStride, out Span<byte> dstUv);

    if (Sse2.IsSupported)
    {
        int widthTrunc = uvWidth & ~7;
        int strideGap = uvStride - uvWidth * 2;

        fixed (Pixel* srcPtr = input.Data)
        {
            Pixel* ip = srcPtr;

            fixed (byte* dstPtr = dstUv)
            {
                byte* op = dstPtr;

                for (int y = 0; y < uvHeight; y++, ip += input.Width * 2)
                {
                    int x = 0;

                    for (; x < widthTrunc; x += 8)
                    {
                        byte* baseOffset = (byte*)ip + (ulong)(uint)x * 16;

                        Vector128<uint> pixel1 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x02));
                        Vector128<uint> pixel2 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x12));
                        Vector128<uint> pixel3 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x22));
                        Vector128<uint> pixel4 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x32));
                        Vector128<uint> pixel5 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x42));
                        Vector128<uint> pixel6 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x52));
                        Vector128<uint> pixel7 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x62));
                        Vector128<uint> pixel8 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x72));

                        Vector128<uint> pixel12 = Sse2.UnpackLow(pixel1, pixel2);
                        Vector128<uint> pixel34 = Sse2.UnpackLow(pixel3, pixel4);
                        Vector128<uint> pixel56 = Sse2.UnpackLow(pixel5, pixel6);
                        Vector128<uint> pixel78 = Sse2.UnpackLow(pixel7, pixel8);

                        Vector128<ulong> pixel1234 = Sse2.UnpackLow(pixel12.AsUInt64(), pixel34.AsUInt64());
                        Vector128<ulong> pixel5678 = Sse2.UnpackLow(pixel56.AsUInt64(), pixel78.AsUInt64());

                        pixel1234 = Sse2.ShiftRightLogical(pixel1234, 2);
                        pixel5678 = Sse2.ShiftRightLogical(pixel5678, 2);

                        Vector128<byte> pixel = Sse2.PackUnsignedSaturate(pixel1234.AsInt16(), pixel5678.AsInt16());

                        Sse2.Store(op, pixel);

                        op += 0x10;
                    }

                    for (; x < uvWidth; x++)
                    {
                        Pixel* px = ip + (uint)(x << 1);

                        *op++ = Downsample(px->G);
                        *op++ = Downsample(px->B);
                    }

                    op += strideGap;
                }
            }
        }
    }
    else
    {
        for (int y = 0; y < uvHeight; y++)
        {
            for (int x = 0; x < uvWidth; x++)
            {
                int xx = x << 1;
                int yy = y << 1;

                int uvOffs = y * uvStride + xx;

                dstUv[uvOffs + 0] = Downsample(input.GetG(xx, yy));
                dstUv[uvOffs + 1] = Downsample(input.GetB(xx, yy));
            }
        }
    }

    WriteBuffer(
        rm,
        dstUv,
        offsets.ChromaUOffset,
        outLinear,
        config.OutChromaWidth + 1,
        config.OutChromaHeight + 1,
        2,
        gobBlocksInY);

    rm.BufferPool.Return(dstUvIndex);
}
private static unsafe uint CalculateSse(uint crc, ReadOnlySpan<byte> buffer)
{
    int chunksize = buffer.Length & ~ChunksizeMask;
    int length = chunksize;

    fixed (byte* bufferPtr = buffer)
    fixed (ulong* k05PolyPtr = K05Poly)
    {
        byte* srcPtr = bufferPtr;

        // There's at least one block of 64.
        Vector128<ulong> x1 = Sse2.LoadVector128((ulong*)(srcPtr + 0x00));
        Vector128<ulong> x2 = Sse2.LoadVector128((ulong*)(srcPtr + 0x10));
        Vector128<ulong> x3 = Sse2.LoadVector128((ulong*)(srcPtr + 0x20));
        Vector128<ulong> x4 = Sse2.LoadVector128((ulong*)(srcPtr + 0x30));
        Vector128<ulong> x5;

        x1 = Sse2.Xor(x1, Sse2.ConvertScalarToVector128UInt32(crc).AsUInt64());

        // k1, k2
        Vector128<ulong> x0 = Sse2.LoadVector128(k05PolyPtr + 0x0);

        srcPtr += 64;
        length -= 64;

        // Parallel fold blocks of 64, if any.
        while (length >= 64)
        {
            x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
            Vector128<ulong> x6 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
            Vector128<ulong> x7 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x00);
            Vector128<ulong> x8 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x00);

            x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
            x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x11);
            x3 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x11);
            x4 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x11);

            Vector128<ulong> y5 = Sse2.LoadVector128((ulong*)(srcPtr + 0x00));
            Vector128<ulong> y6 = Sse2.LoadVector128((ulong*)(srcPtr + 0x10));
            Vector128<ulong> y7 = Sse2.LoadVector128((ulong*)(srcPtr + 0x20));
            Vector128<ulong> y8 = Sse2.LoadVector128((ulong*)(srcPtr + 0x30));

            x1 = Sse2.Xor(x1, x5);
            x2 = Sse2.Xor(x2, x6);
            x3 = Sse2.Xor(x3, x7);
            x4 = Sse2.Xor(x4, x8);

            x1 = Sse2.Xor(x1, y5);
            x2 = Sse2.Xor(x2, y6);
            x3 = Sse2.Xor(x3, y7);
            x4 = Sse2.Xor(x4, y8);

            srcPtr += 64;
            length -= 64;
        }

        // Fold into 128-bits.
        // k3, k4
        x0 = Sse2.LoadVector128(k05PolyPtr + 0x2);

        x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
        x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
        x1 = Sse2.Xor(x1, x2);
        x1 = Sse2.Xor(x1, x5);

        x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
        x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
        x1 = Sse2.Xor(x1, x3);
        x1 = Sse2.Xor(x1, x5);

        x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
        x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
        x1 = Sse2.Xor(x1, x4);
        x1 = Sse2.Xor(x1, x5);

        // Single fold blocks of 16, if any.
        while (length >= 16)
        {
            x2 = Sse2.LoadVector128((ulong*)srcPtr);

            x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
            x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
            x1 = Sse2.Xor(x1, x2);
            x1 = Sse2.Xor(x1, x5);

            srcPtr += 16;
            length -= 16;
        }

        // Fold 128 bits to 64 bits.
        x2 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x10);
        x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64(); // _mm_setr_epi32 on x86
        x1 = Sse2.ShiftRightLogical128BitLane(x1, 8);
        x1 = Sse2.Xor(x1, x2);

        // k5, k0
        x0 = Sse2.LoadScalarVector128(k05PolyPtr + 0x4);

        x2 = Sse2.ShiftRightLogical128BitLane(x1, 4);
        x1 = Sse2.And(x1, x3);
        x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
        x1 = Sse2.Xor(x1, x2);

        // Barrett reduce to 32 bits.
        // polynomial
        x0 = Sse2.LoadVector128(k05PolyPtr + 0x6);

        x2 = Sse2.And(x1, x3);
        x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x10);
        x2 = Sse2.And(x2, x3);
        x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
        x1 = Sse2.Xor(x1, x2);

        crc = (uint)Sse41.Extract(x1.AsInt32(), 1);
        return buffer.Length - chunksize == 0 ? crc : CalculateScalar(crc, buffer[chunksize..]);
    }
}
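// A scalar reference for validating the folded result (illustrative). The k-constants in
// K05Poly are not shown; assuming they encode the usual reflected CRC-32 polynomial
// 0xEDB88320 (zlib-style), this bitwise loop should agree with CalculateSse:
static uint Crc32Scalar(uint crc, ReadOnlySpan<byte> data)
{
    foreach (byte b in data)
    {
        crc ^= b;
        for (int k = 0; k < 8; k++)
        {
            crc = (crc >> 1) ^ (0xEDB88320u & (uint)-(int)(crc & 1));
        }
    }
    return crc;
}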