示例#1
0
        /// <summary>
        /// Runs <c>Sse3.AddSubtract</c> on the two fields of a <c>TestStruct</c> held in a local,
        /// writes the result to <c>_dataTable.outArrayPtr</c> and passes inputs and output to
        /// <c>ValidateResult</c>.
        /// </summary>
        public void RunStructLclFldScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario));

            // Operands are read directly from the fields of the local struct.
            var localStruct = TestStruct.Create();
            var actual      = Sse3.AddSubtract(localStruct._fld1, localStruct._fld2);

            Unsafe.Write(_dataTable.outArrayPtr, actual);
            ValidateResult(localStruct._fld1, localStruct._fld2, _dataTable.outArrayPtr);
        }
示例#2
0
        /// <summary>
        /// Runs <c>Sse3.AddSubtract</c> with <c>_clsVar1</c> and <c>_clsVar2</c> passed directly as
        /// arguments, writes the result to <c>_dataTable.outArrayPtr</c> and passes inputs and
        /// output to <c>ValidateResult</c>.
        /// </summary>
        public void RunClsVarScenario()
        {
            var actual = Sse3.AddSubtract(_clsVar1, _clsVar2);

            Unsafe.Write(_dataTable.outArrayPtr, actual);
            ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
        }
        /// <summary>
        /// Runs <c>Sse3.HorizontalAdd</c> on the two fields of a newly constructed
        /// <c>HorizontalBinaryOpTest__HorizontalAddSingle</c> held in a local, writes the result to
        /// <c>_dataTable.outArrayPtr</c> and passes inputs and output to <c>ValidateResult</c>.
        /// </summary>
        public void RunClassLclFldScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario));

            // Operands are read directly from the fields of the local class instance.
            var instance = new HorizontalBinaryOpTest__HorizontalAddSingle();
            var actual   = Sse3.HorizontalAdd(instance._fld1, instance._fld2);

            Unsafe.Write(_dataTable.outArrayPtr, actual);
            ValidateResult(instance._fld1, instance._fld2, _dataTable.outArrayPtr);
        }
示例#4
0
        /// <summary>
        /// Runs <c>Sse3.AddSubtract</c> on two <c>Vector128&lt;Double&gt;</c> values obtained with
        /// <c>Unsafe.Read</c> from the input pointers of <c>_dataTable</c>, writes the result to
        /// <c>_dataTable.outArrayPtr</c> and passes the three pointers to <c>ValidateResult</c>.
        /// </summary>
        public void RunBasicScenario_UnsafeRead()
        {
            // Both reads stay inline as call arguments; no operand locals are introduced.
            var actual = Sse3.AddSubtract(
                Unsafe.Read<Vector128<Double>>(_dataTable.inArray1Ptr),
                Unsafe.Read<Vector128<Double>>(_dataTable.inArray2Ptr));

            Unsafe.Write(_dataTable.outArrayPtr, actual);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
        }
示例#5
0
        /// <summary>
        /// Runs <c>Sse3.AddSubtract</c> on two vectors loaded with
        /// <c>Sse2.LoadAlignedVector128</c> from the input pointers of <c>_dataTable</c>, writes the
        /// result to <c>_dataTable.outArrayPtr</c> and passes the three pointers to
        /// <c>ValidateResult</c>.
        /// </summary>
        public void RunBasicScenario_LoadAligned()
        {
            // Both loads stay inline as call arguments; no operand locals are introduced.
            var actual = Sse3.AddSubtract(
                Sse2.LoadAlignedVector128((Double*)_dataTable.inArray1Ptr),
                Sse2.LoadAlignedVector128((Double*)_dataTable.inArray2Ptr));

            Unsafe.Write(_dataTable.outArrayPtr, actual);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
        }
示例#6
0
        /// <summary>
        /// Runs <c>Sse3.HorizontalSubtract</c> on two vectors loaded with <c>Sse.LoadVector128</c>
        /// from the input pointers of <c>_dataTable</c>, writes the result to
        /// <c>_dataTable.outArrayPtr</c> and passes the three pointers to <c>ValidateResult</c>.
        /// </summary>
        public void RunBasicScenario_Load()
        {
            // Both loads stay inline as call arguments; no operand locals are introduced.
            var actual = Sse3.HorizontalSubtract(
                Sse.LoadVector128((Single*)_dataTable.inArray1Ptr),
                Sse.LoadVector128((Single*)_dataTable.inArray2Ptr));

            Unsafe.Write(_dataTable.outArrayPtr, actual);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
        }
示例#7
0
        /// <summary>
        /// Runs <c>Sse3.HorizontalAdd</c> on two <c>Vector128&lt;Single&gt;</c> values obtained with
        /// <c>Unsafe.Read</c> from the input pointers of <c>_dataTable</c>, writes the result to
        /// <c>_dataTable.outArrayPtr</c> and passes the three pointers to <c>ValidateResult</c>.
        /// </summary>
        public void RunBasicScenario_UnsafeRead()
        {
            // Both reads stay inline as call arguments; no operand locals are introduced.
            var actual = Sse3.HorizontalAdd(
                Unsafe.Read<Vector128<Single>>(_dataTable.inArray1Ptr),
                Unsafe.Read<Vector128<Single>>(_dataTable.inArray2Ptr));

            Unsafe.Write(_dataTable.outArrayPtr, actual);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
        }
示例#8
0
        /// <summary>
        /// Runs <c>Sse3.AddSubtract</c> on the two fields of a newly constructed
        /// <c>AlternatingBinaryOpTest__AddSubtractDouble</c> held in a local, writes the result to
        /// <c>_dataTable.outArrayPtr</c> and passes inputs and output to <c>ValidateResult</c>.
        /// </summary>
        public void RunClassLclFldScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario));

            // Operands are read directly from the fields of the local class instance.
            var instance = new AlternatingBinaryOpTest__AddSubtractDouble();
            var actual   = Sse3.AddSubtract(instance._fld1, instance._fld2);

            Unsafe.Write(_dataTable.outArrayPtr, actual);
            ValidateResult(instance._fld1, instance._fld2, _dataTable.outArrayPtr);
        }
    /// <summary>
    /// Sums <paramref name="count"/> floats starting at <paramref name="arr"/>, using an SSE3
    /// path, a NEON path, or a scalar loop depending on which feature flag is set.
    /// </summary>
    /// <param name="arr">Pointer to the first float to sum.</param>
    /// <param name="count">Number of floats to sum; asserted to be a multiple of 4.</param>
    /// <returns>The sum of the <paramref name="count"/> floats.</returns>
    static float ComputeSumSimd(float *arr, int count)
    {
        // We're just going to assume that the length of the data is a multiple of 4, otherwise we'd have to handle the
        // other cases. It's not hard, but tedious.
        Assert.IsTrue(count % 4 == 0);

        // The only instruction beyond baseline SSE used in this branch is hadd_ps, which is an SSE3 instruction,
        // so the guard tests SSE3 (not SSSE3) to match what the branch actually requires.
        if (Sse3.IsSse3Supported)
        {
            // To sum up all values in the array, we split the array into 4 subarrays and store their sums in the variable
            // `sum` below.
            v128 sum = new v128(0f);
            for (int i = 0; i < count; i += 4)
            {
                // Load 4 floats from memory.
                v128 reg = loadu_ps(arr + i);
                sum = add_ps(sum, reg);
            }

            // At this point, we have the sums of 4 subarrays in `sum` and we still need to merge them. SSE3 has a helpful
            // instruction for this:
            sum = Sse3.hadd_ps(sum, sum);
            // Now the first and third lane hold the sum of the first two subarrays and the second and fourth lane contain
            // the sum of the last two subarrays.
            sum = Sse3.hadd_ps(sum, sum);
            // Finally, all four lanes hold the same value (the sum of all subarrays) and we can return the first value
            // as a float.
            return(cvtss_f32(sum));

            // or alternatively, simply write:
            // return sum.Float0 + sum.Float1 + sum.Float2 + sum.Float3;
        }
        else if (IsNeonSupported)
        {
            // Same as above: 4 subarrays to accumulate the sum
            v128 sum = new v128(0f);
            for (int i = 0; i < count; i += 4)
            {
                // Load 4 floats from memory.
                v128 reg = vld1q_f32(arr + i);
                sum = vaddq_f32(sum, reg);
            }
            // Reduce the four lane sums to a single float.
            return(vaddvq_f32(sum));
        }
        else
        {
            // Managed fallback, equivalent to ComputeSum()
            float sum = 0;
            for (int i = 0; i < count; i++)
            {
                sum += arr[i];
            }
            return(sum);
        }
    }
示例#10
0
        /// <summary>
        /// Loads both operands with <c>Sse2.LoadAlignedVector128</c> into locals, runs
        /// <c>Sse3.AddSubtract</c> on those locals, writes the result to
        /// <c>_dataTable.outArrayPtr</c> and passes the locals and output to <c>ValidateResult</c>.
        /// </summary>
        public void RunLclVarScenario_LoadAligned()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned));

            // Operands are materialised in locals before the intrinsic call.
            var lhs    = Sse2.LoadAlignedVector128((Double*)_dataTable.inArray1Ptr);
            var rhs    = Sse2.LoadAlignedVector128((Double*)_dataTable.inArray2Ptr);
            var actual = Sse3.AddSubtract(lhs, rhs);

            Unsafe.Write(_dataTable.outArrayPtr, actual);
            ValidateResult(lhs, rhs, _dataTable.outArrayPtr);
        }
示例#11
0
        /// <summary>
        /// Loads both operands with <c>Sse.LoadVector128</c> into locals, runs
        /// <c>Sse3.HorizontalAdd</c> on those locals, writes the result to
        /// <c>_dataTable.outArrayPtr</c> and passes the locals and output to <c>ValidateResult</c>.
        /// </summary>
        public void RunLclVarScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_Load));

            // Operands are materialised in locals before the intrinsic call.
            var lhs    = Sse.LoadVector128((Single*)_dataTable.inArray1Ptr);
            var rhs    = Sse.LoadVector128((Single*)_dataTable.inArray2Ptr);
            var actual = Sse3.HorizontalAdd(lhs, rhs);

            Unsafe.Write(_dataTable.outArrayPtr, actual);
            ValidateResult(lhs, rhs, _dataTable.outArrayPtr);
        }
示例#12
0
        /// <summary>
        /// Loads both operands with <c>Sse.LoadAlignedVector128</c> into locals, runs
        /// <c>Sse3.HorizontalSubtract</c> on those locals, writes the result to
        /// <c>_dataTable.outArrayPtr</c> and passes the locals and output to <c>ValidateResult</c>.
        /// </summary>
        public void RunLclVarScenario_LoadAligned()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned));

            // Operands are materialised in locals before the intrinsic call.
            var op1    = Sse.LoadAlignedVector128((Single*)_dataTable.inArray1Ptr);
            var op2    = Sse.LoadAlignedVector128((Single*)_dataTable.inArray2Ptr);
            var actual = Sse3.HorizontalSubtract(op1, op2);

            Unsafe.Write(_dataTable.outArrayPtr, actual);
            ValidateResult(op1, op2, _dataTable.outArrayPtr);
        }
        /// <summary>
        /// Reads both <c>Vector128&lt;Double&gt;</c> operands with <c>Unsafe.Read</c> into locals,
        /// runs <c>Sse3.HorizontalSubtract</c> on those locals, writes the result to
        /// <c>_dataTable.outArrayPtr</c> and passes the locals and output to <c>ValidateResult</c>.
        /// </summary>
        public void RunLclVarScenario_UnsafeRead()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

            // Operands are materialised in locals before the intrinsic call.
            var lhs    = Unsafe.Read<Vector128<Double>>(_dataTable.inArray1Ptr);
            var rhs    = Unsafe.Read<Vector128<Double>>(_dataTable.inArray2Ptr);
            var actual = Sse3.HorizontalSubtract(lhs, rhs);

            Unsafe.Write(_dataTable.outArrayPtr, actual);
            ValidateResult(lhs, rhs, _dataTable.outArrayPtr);
        }
示例#14
0
        /// <summary>
        /// Horizontally adds <paramref name="left"/> and <paramref name="right"/>, using
        /// <c>Sse3.HorizontalAdd</c> when SSE3 is supported and the software implementation otherwise.
        /// </summary>
        /// <param name="left">First operand.</param>
        /// <param name="right">Second operand.</param>
        /// <returns>The horizontal-add result from whichever path was taken.</returns>
        public static Vector4F HorizontalAdd(Vector4FParam1_3 left, Vector4FParam1_3 right)
        {
            if (!Sse3.IsSupported)
            {
                // TODO can Sse be used over the software fallback?
                return SoftwareFallbacks.SoftwareFallbacksVector4F.HorizontalAdd_Software(left, right);
            }

            return Sse3.HorizontalAdd(left, right);
        }
示例#15
0
        /// <summary>
        /// Reads both <c>Vector128&lt;Double&gt;</c> operands with <c>Unsafe.Read</c> into locals,
        /// runs <c>Sse3.HorizontalAdd</c> on those locals, writes the result to
        /// <c>_dataTable.outArrayPtr</c> and passes the locals and output to <c>ValidateResult</c>.
        /// </summary>
        public void RunLclVarScenario_UnsafeRead()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

            // Operands are materialised in locals before the intrinsic call.
            var op1    = Unsafe.Read<Vector128<Double>>(_dataTable.inArray1Ptr);
            var op2    = Unsafe.Read<Vector128<Double>>(_dataTable.inArray2Ptr);
            var actual = Sse3.HorizontalAdd(op1, op2);

            Unsafe.Write(_dataTable.outArrayPtr, actual);
            ValidateResult(op1, op2, _dataTable.outArrayPtr);
        }
示例#16
0
        /// <summary>
        /// Runs <c>Sse3.HorizontalSubtract</c> on two vectors loaded with <c>Sse.LoadVector128</c>
        /// from the input pointers of <c>_dataTable</c>, writes the result to
        /// <c>_dataTable.outArrayPtr</c> and passes the three pointers to <c>ValidateResult</c>.
        /// </summary>
        public void RunBasicScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_Load));

            // Both loads stay inline as call arguments; no operand locals are introduced.
            var actual = Sse3.HorizontalSubtract(
                Sse.LoadVector128((Single*)_dataTable.inArray1Ptr),
                Sse.LoadVector128((Single*)_dataTable.inArray2Ptr));

            Unsafe.Write(_dataTable.outArrayPtr, actual);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
        }
示例#17
0
        /// <summary>
        /// Runs <c>Sse3.HorizontalSubtract</c> with <c>_clsVar1</c> and <c>_clsVar2</c> passed
        /// directly as arguments, writes the result to <c>_dataTable.outArrayPtr</c> and passes
        /// inputs and output to <c>ValidateResult</c>.
        /// </summary>
        public void RunClsVarScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario));

            var actual = Sse3.HorizontalSubtract(_clsVar1, _clsVar2);

            Unsafe.Write(_dataTable.outArrayPtr, actual);
            ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
        }
示例#18
0
        /// <summary>
        /// Runs <c>Sse3.HorizontalAdd</c> on two <c>Vector128&lt;Single&gt;</c> values obtained with
        /// <c>Unsafe.Read</c> from the input pointers of <c>_dataTable</c>, writes the result to
        /// <c>_dataTable.outArrayPtr</c> and passes the three pointers to <c>ValidateResult</c>.
        /// </summary>
        public void RunBasicScenario_UnsafeRead()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_UnsafeRead));

            // Both reads stay inline as call arguments; no operand locals are introduced.
            var actual = Sse3.HorizontalAdd(
                Unsafe.Read<Vector128<Single>>(_dataTable.inArray1Ptr),
                Unsafe.Read<Vector128<Single>>(_dataTable.inArray2Ptr));

            Unsafe.Write(_dataTable.outArrayPtr, actual);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
        }
示例#19
0
        /// <summary>
        /// Returns <c>(a.x - b.x, a.y + b.y)</c>: the first lane is subtracted, the second is added.
        /// </summary>
        /// <param name="a">Left operand.</param>
        /// <param name="b">Right operand.</param>
        /// <returns>The alternating subtract/add of <paramref name="a"/> and <paramref name="b"/>.</returns>
        public static double2 subadd(double2 a, double2 b)
        {
            if (!Sse3.IsSse3Supported)
            {
                // Software path: flip the sign of b's second lane so that subtracting it adds there.
                return a - math.select(b, -b, new bool2(false, true));
            }

            // Hardware path: reinterpret both operands as v128 and let addsub_pd do the work.
            v128 packed = Sse3.addsub_pd(*(v128*)&a, *(v128*)&b);

            return *(double2*)&packed;
        }
示例#20
0
        /// <summary>
        /// Returns <c>(a.x - b.x, a.y + b.y, a.z - b.z, a.w + b.w)</c>: even lanes are subtracted,
        /// odd lanes are added.
        /// </summary>
        /// <param name="a">Left operand.</param>
        /// <param name="b">Right operand.</param>
        /// <returns>The alternating subtract/add of <paramref name="a"/> and <paramref name="b"/>.</returns>
        public static float4 subadd(float4 a, float4 b)
        {
            if (!Sse3.IsSse3Supported)
            {
                // Software path: flip the sign of b's odd lanes so that subtracting them adds there.
                return a - math.select(b, -b, new bool4(false, true, false, true));
            }

            // Hardware path: reinterpret both operands as v128 and let addsub_ps do the work.
            v128 packed = Sse3.addsub_ps(*(v128*)&a, *(v128*)&b);

            return *(float4*)&packed;
        }
示例#21
0
        /// <summary>
        /// Runs <c>Sse3.HorizontalAdd</c> on vectors loaded with <c>Sse.LoadVector128</c> from the
        /// addresses of the two fields of a <c>TestStruct</c> held in a local, writes the result to
        /// <c>_dataTable.outArrayPtr</c> and passes the fields and output to <c>ValidateResult</c>.
        /// </summary>
        public void RunStructLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

            var localStruct = TestStruct.Create();

            // The struct is a local, so the field addresses are taken directly without `fixed`.
            var actual = Sse3.HorizontalAdd(
                Sse.LoadVector128((Single*)&localStruct._fld1),
                Sse.LoadVector128((Single*)&localStruct._fld2));

            Unsafe.Write(_dataTable.outArrayPtr, actual);
            ValidateResult(localStruct._fld1, localStruct._fld2, _dataTable.outArrayPtr);
        }
示例#22
0
        private Hit[] RayTraceAVXFaster(Ray ray)
        {
            Vector256 <double> dir      = (Vector256 <double>)ray.Direction;
            Vector256 <double> vert0    = (Vector256 <double>)Vert0.Position;
            Vector256 <double> edge0to1 = (Vector256 <double>)Edge0to1;
            Vector256 <double> edge0to2 = (Vector256 <double>)Edge0to2;

            Vector256 <double> offset = Avx.Subtract((Vector256 <double>)ray.Origin, vert0);
            Vector256 <double> side1  = SIMDHelpers.Cross(offset, edge0to1);
            Vector256 <double> side2  = SIMDHelpers.Cross(dir, edge0to2);

            // Prepare all dot products
            Vector256 <double> uvTemp    = Avx.Multiply(offset, side2);         // u
            Vector256 <double> temp      = Avx.Multiply(dir, side1);            // v
            Vector256 <double> edge2Temp = Avx.Multiply(edge0to2, side1);
            Vector256 <double> distTemp  = Avx.Multiply(edge0to1, side2);

            uvTemp    = Avx.HorizontalAdd(uvTemp, temp);
            edge2Temp = Avx.HorizontalAdd(edge2Temp, edge2Temp);
            distTemp  = Avx.HorizontalAdd(distTemp, distTemp);

            // Complete all dot products for SSE ops
            Vector128 <double> uvs   = SIMDHelpers.Add2(uvTemp);
            Vector128 <double> dist  = SIMDHelpers.Add2(edge2Temp);
            Vector128 <double> temp1 = SIMDHelpers.Add2(distTemp);
            Vector128 <double> temp2;

            // vec2 constants we'll be using later
            Vector128 <double> ones2   = SIMDHelpers.BroadcastScalar2(1D);
            Vector128 <double> zeroes2 = new Vector128 <double>();

            // Reciprocal of distance along edge0to1
            temp1 = Sse2.Divide(ones2, temp1);
            temp2 = Sse2.CompareOrdered(temp1, temp1);
            // Remove NaNs from the result, replaced with 0
            Vector128 <double> distZeroed = Sse2.And(temp1, temp2);

            uvs  = Sse2.Multiply(uvs, distZeroed);
            dist = Sse2.Multiply(dist, distZeroed);

            // compare uvs < 0 and > 1, dist < 0, jump out if any of those conditions are met
            temp1 = Sse2.CompareLessThan(uvs, zeroes2);
            temp2 = Mirror ? uvs : Sse3.HorizontalAdd(uvs, uvs);
            temp2 = Sse2.CompareGreaterThan(temp2, ones2);
            temp1 = Sse2.Or(temp1, temp2);
            temp2 = Sse2.CompareLessThan(dist, zeroes2);
            temp1 = Sse2.Or(temp1, temp2);

            if (!Avx.TestZ(temp1, temp1))
            {
                return(default);
示例#23
0
文件: Game.cs 项目: jdarc/simd-bench
        /// <summary>
        /// Per-frame update: handles the exit keys, builds the combined transform for the current
        /// time, clears the raster and plots every vertex of <c>_vertices</c> as a pixel, with the
        /// work split across <see cref="Environment.ProcessorCount"/> parallel chunks.
        /// </summary>
        /// <param name="gameTime">Timing information; its total ticks drive the Y rotation angle.</param>
        protected override void Update(GameTime gameTime)
        {
            if (GamePad.GetState(PlayerIndex.One).Buttons.Back == ButtonState.Pressed ||
                Keyboard.GetState().IsKeyDown(Keys.Escape))
            {
                Exit();
            }

            var width  = _graphics.PreferredBackBufferWidth;
            var height = _graphics.PreferredBackBufferHeight;

            var ang1      = gameTime.TotalGameTime.Ticks / 9230000.0F;
            var aspect    = (float)width / height;
            var look      = Matrix4x4.CreateLookAt(new Vector3(0, 0, 60), Vector3.One, Vector3.UnitY);
            var proj      = Matrix4x4.CreatePerspectiveFieldOfView((float)(Math.PI / 3.0), aspect, 1.0F, 1000.0F);
            var rotationY = Matrix4x4.CreateRotationY(ang1);
            var rotationX = Matrix4x4.CreateRotationX(0.5F);
            var comb      = rotationY * rotationX * Matrix4x4.CreateScale(10.0F) * look * proj;

            // m0..m3 are the four columns of the combined matrix, so that the dot product of a
            // vertex with mN yields component N of the transformed vertex.
            var m0        = Vector128.Create(comb.M11, comb.M21, comb.M31, comb.M41);
            var m1        = Vector128.Create(comb.M12, comb.M22, comb.M32, comb.M42);
            var m2        = Vector128.Create(comb.M13, comb.M23, comb.M33, comb.M43);
            var m3        = Vector128.Create(comb.M14, comb.M24, comb.M34, comb.M44);

            var inv    = Vector128.Create(1.0F, -1.0F, 1.0F, 1.0F);   // negates the Y component
            var half   = Vector128.Create(0.5F);
            var screen = Vector128.Create(width, height, 0.0F, 0.0F);

            _colorRaster.Clear(-0x1000000);

            // Split the vertices into one contiguous chunk per worker. The chunk size is rounded
            // up and each chunk's end is clamped, so the trailing vertices are still processed
            // when the vertex count is not an exact multiple of the worker count.
            var vertexCount = _vertices.Length;
            var workerCount = Environment.ProcessorCount;
            var chunkSize   = (vertexCount + workerCount - 1) / workerCount;

            Parallel.For(0, workerCount, worker =>
            {
                var start = worker * chunkSize;
                var end   = Math.Min(start + chunkSize, vertexCount);
                for (var l = start; l < end; l++)
                {
                    var vv  = _vertices[l];

                    // Two rounds of horizontal adds collapse the four lane-wise products into
                    // (dot(vv, m0), dot(vv, m1), dot(vv, m2), dot(vv, m3)).
                    var h0  = Sse3.HorizontalAdd(Sse.Multiply(vv, m0), Sse.Multiply(vv, m1));
                    var h1  = Sse3.HorizontalAdd(Sse.Multiply(vv, m2), Sse.Multiply(vv, m3));
                    var h3  = Sse.Multiply(inv, Sse3.HorizontalAdd(h0, h1));

                    // Divide by the fourth component, then map (value + 1) * 0.5 onto the screen size.
                    var vv2 = Sse.Divide(h3, Vector128.Create(h3.GetElement(3)));
                    var vv4 = Sse.Multiply(screen, Sse.Multiply(half, Sse.Add(Vector128.Create(1.0F), vv2)));
                    var f   = Sse2.ConvertToVector128Int32(vv4);
                    var sx  = f.GetElement(0);
                    var sy  = f.GetElement(1);
                    _colorRaster[sx, sy] = 0xFFFFFF;
                }
            });

            base.Update(gameTime);
        }
示例#24
0
            /// <summary>
            /// Pins <c>_fld1</c> and <c>_fld2</c>, runs <c>Sse3.AddSubtract</c> on vectors loaded
            /// from them with <c>Sse2.LoadVector128</c>, writes the result to the output pointer
            /// of <paramref name="testClass"/> and asks it to validate.
            /// </summary>
            /// <param name="testClass">Test instance that supplies the output buffer and <c>ValidateResult</c>.</param>
            public void RunStructFldScenario_Load(AlternatingBinaryOpTest__AddSubtractDouble testClass)
            {
                fixed (Vector128<Double>* firstPtr = &_fld1)
                fixed (Vector128<Double>* secondPtr = &_fld2)
                {
                    var actual = Sse3.AddSubtract(
                        Sse2.LoadVector128((Double*)firstPtr),
                        Sse2.LoadVector128((Double*)secondPtr));

                    Unsafe.Write(testClass._dataTable.outArrayPtr, actual);
                    testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
                }
            }
示例#25
0
            /// <summary>
            /// Pins <c>_fld1</c> and <c>_fld2</c>, runs <c>Sse3.HorizontalAdd</c> on vectors loaded
            /// from them with <c>Sse.LoadVector128</c>, writes the result to the output pointer of
            /// <paramref name="testClass"/> and asks it to validate.
            /// </summary>
            /// <param name="testClass">Test instance that supplies the output buffer and <c>ValidateResult</c>.</param>
            public void RunStructFldScenario_Load(HorizontalBinaryOpTest__HorizontalAddSingle testClass)
            {
                fixed (Vector128<Single>* firstPtr = &_fld1)
                fixed (Vector128<Single>* secondPtr = &_fld2)
                {
                    var actual = Sse3.HorizontalAdd(
                        Sse.LoadVector128((Single*)firstPtr),
                        Sse.LoadVector128((Single*)secondPtr));

                    Unsafe.Write(testClass._dataTable.outArrayPtr, actual);
                    testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
                }
            }
示例#26
0
        /// <summary>
        /// Computes one 128-bit vector's worth of cells for a diagonal and stores it into the
        /// buffer behind <paramref name="refDiag1Ptr"/>. Each cell is
        /// <c>min(insert, delete, substitution) + 1</c>, where the substitution candidate is
        /// reduced by one when the source and target characters match.
        /// </summary>
        /// <typeparam name="T">Cell type; only <c>int</c> and <c>ushort</c> are handled, any other type is a no-op.</typeparam>
        /// <param name="refDiag1Ptr">Diagonal buffer read for the substitution cost and written with the result.</param>
        /// <param name="refDiag2Ptr">Diagonal buffer read for the insert and delete costs.</param>
        /// <param name="sourcePtr">Source characters, read at the window ending just before <paramref name="rowIndex"/>.</param>
        /// <param name="targetPtr">Target characters, read from <c>columnIndex - 1</c> and lane-reversed.</param>
        /// <param name="rowIndex">Row index; only read by this method.</param>
        /// <param name="columnIndex">Column index.</param>
        public static unsafe void CalculateDiagonalSection_Sse41<T>(void* refDiag1Ptr, void* refDiag2Ptr, char* sourcePtr, char* targetPtr, ref int rowIndex, int columnIndex) where T : struct
        {
            if (typeof(T) == typeof(int))
            {
                int* diag1 = (int*)refDiag1Ptr;
                int* diag2 = (int*)refDiag2Ptr;

                // Widen the UTF-16 characters to 32-bit lanes; 0x1b reverses the four target lanes.
                Vector128<int> sourceChars = Sse41.ConvertToVector128Int32((ushort*)sourcePtr + rowIndex - Vector128<T>.Count);
                Vector128<int> targetChars = Sse2.Shuffle(
                    Sse41.ConvertToVector128Int32((ushort*)targetPtr + columnIndex - 1),
                    0x1b);

                // CompareEqual yields all-ones (-1) in matching lanes, so adding it subtracts one there.
                Vector128<int> matchAdjustment = Sse2.CompareEqual(sourceChars, targetChars);
                Vector128<int> substitution    = Sse2.Add(
                    Sse3.LoadDquVector128(diag1 + rowIndex - Vector128<T>.Count),
                    matchAdjustment);

                Vector128<int> deletion  = Sse3.LoadDquVector128(diag2 + rowIndex - (Vector128<T>.Count - 1));
                Vector128<int> insertion = Sse3.LoadDquVector128(diag2 + rowIndex - Vector128<T>.Count);

                Vector128<int> best = Sse41.Min(Sse41.Min(insertion, deletion), substitution);
                best = Sse2.Add(best, Vector128.Create(1));

                Sse2.Store(diag1 + rowIndex - (Vector128<T>.Count - 1), best);
            }
            else if (typeof(T) == typeof(ushort))
            {
                ushort* diag1 = (ushort*)refDiag1Ptr;
                ushort* diag2 = (ushort*)refDiag2Ptr;

                // Characters are already 16-bit lanes; the byte shuffle applies the reversal mask to the target.
                Vector128<ushort> sourceChars = Sse3.LoadDquVector128((ushort*)sourcePtr + rowIndex - Vector128<T>.Count);
                Vector128<ushort> targetChars = Ssse3.Shuffle(
                    Sse3.LoadDquVector128((ushort*)targetPtr + columnIndex - 1).AsByte(),
                    REVERSE_USHORT_AS_BYTE_128).AsUInt16();

                // CompareEqual yields all-ones in matching lanes, so the wrapping add subtracts one there.
                Vector128<ushort> matchAdjustment = Sse2.CompareEqual(sourceChars, targetChars);
                Vector128<ushort> substitution    = Sse2.Add(
                    Sse3.LoadDquVector128(diag1 + rowIndex - Vector128<T>.Count),
                    matchAdjustment);

                Vector128<ushort> deletion  = Sse3.LoadDquVector128(diag2 + rowIndex - (Vector128<T>.Count - 1));
                Vector128<ushort> insertion = Sse3.LoadDquVector128(diag2 + rowIndex - Vector128<T>.Count);

                Vector128<ushort> best = Sse41.Min(Sse41.Min(insertion, deletion), substitution);
                best = Sse2.Add(best, Vector128.Create((ushort)1));

                Sse2.Store(diag1 + rowIndex - (Vector128<T>.Count - 1), best);
            }
        }
        /// <summary>
        /// Pins <c>_clsVar1</c> and <c>_clsVar2</c>, runs <c>Sse3.HorizontalSubtract</c> on vectors
        /// loaded from them with <c>Sse2.LoadVector128</c>, writes the result to
        /// <c>_dataTable.outArrayPtr</c> and passes inputs and output to <c>ValidateResult</c>.
        /// </summary>
        public void RunClsVarScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario_Load));

            fixed (Vector128<Double>* firstPtr = &_clsVar1)
            fixed (Vector128<Double>* secondPtr = &_clsVar2)
            {
                var actual = Sse3.HorizontalSubtract(
                    Sse2.LoadVector128((Double*)firstPtr),
                    Sse2.LoadVector128((Double*)secondPtr));

                Unsafe.Write(_dataTable.outArrayPtr, actual);
                ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
            }
        }
示例#28
0
        /// <summary>
        /// Pins <c>_fld1</c> and <c>_fld2</c>, runs <c>Sse3.HorizontalAdd</c> on vectors loaded
        /// from them with <c>Sse.LoadVector128</c>, writes the result to
        /// <c>_dataTable.outArrayPtr</c> and passes inputs and output to <c>ValidateResult</c>.
        /// </summary>
        public void RunClassFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

            fixed (Vector128<Single>* firstPtr = &_fld1)
            fixed (Vector128<Single>* secondPtr = &_fld2)
            {
                var actual = Sse3.HorizontalAdd(
                    Sse.LoadVector128((Single*)firstPtr),
                    Sse.LoadVector128((Single*)secondPtr));

                Unsafe.Write(_dataTable.outArrayPtr, actual);
                ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
            }
        }
        /// <summary>
        /// Computes the squared 2D distance between <paramref name="left"/> and
        /// <paramref name="right"/> (only the first two elements take part) and broadcasts it to
        /// every element of the returned vector.
        /// </summary>
        /// <param name="left">First point.</param>
        /// <param name="right">Second point.</param>
        /// <returns>A vector whose four elements all hold <c>dx*dx + dy*dy</c>.</returns>
        public static Vector4F DistanceSquared2D(Vector4FParam1_3 left, Vector4FParam1_3 right)
        {
            // SSE4.1 has a native dot product instruction, dpps
            if (Sse41.IsSupported)
            {
                Vector4F diff = Sse.Subtract(left, right);

                // This multiplies the first 2 elems of each and broadcasts it into each element of the returning vector
                const byte control = 0b_0011_1111;
                return(Sse41.DotProduct(diff, diff, control));
            }
            // We can use SSE to vectorize the multiplication
            // There are different fastest methods to sum the resultant vector
            // on SSE3 vs SSE1
            else if (Sse3.IsSupported)
            {
                Vector4F diff = Sse.Subtract(left, right);

                Vector4F mul = Sse.Multiply(diff, diff);

                // Horizontal add of (X, Y, Z, W) with itself gives (X+Y, Z+W, X+Y, Z+W).
                // Z and W need no masking beforehand: they only reach lanes 1 and 3,
                // which the duplicate below overwrites.
                Vector4F sums = Sse3.HorizontalAdd(mul, mul);

                // MoveLowAndDuplicate maps (A, B, C, D) to (A, A, C, C), so every lane becomes X+Y
                return(Sse3.MoveLowAndDuplicate(sums));
            }
            else if (Sse.IsSupported)
            {
                Vector4F diff = Sse.Subtract(left, right);

                Vector4F mul = Sse.Multiply(diff, diff);

                // Broadcast Y, add it to the lowest lane only (giving X+Y there), then broadcast that lane
                Vector4F temp = Sse.Shuffle(mul, mul, Helpers.Shuffle(1, 1, 1, 1));

                mul = Sse.AddScalar(mul, temp);

                mul = Sse.Shuffle(mul, mul, Helpers.Shuffle(0, 0, 0, 0));

                return(mul);
            }

            return(DistanceSquared2D_Software(left, right));
        }
示例#30
0
        /// <summary>
        /// Divides every element of <paramref name="vector"/> by the 2D length of the vector,
        /// i.e. by the square root of the sum of squares of its first two elements.
        /// </summary>
        /// <param name="vector">Vector to normalize.</param>
        /// <returns><paramref name="vector"/> divided element-wise by its 2D length.</returns>
        public static VectorF Normalize2D(VectorFParam1_3 vector)
        {
            #region Manual Inline
            // SSE4.1 has a native dot product instruction, dpps
            if (Sse41.IsSupported)
            {
                // This multiplies the first 2 elems of each and broadcasts it into each element of the returning vector
                const byte control = 0b_0011_1111;
                VectorF    dp      = Sse41.DotProduct(vector, vector, control);

                return(Sse.Divide(vector, Sse.Sqrt(dp)));
            }
            // We can use SSE to vectorize the multiplication
            // There are different fastest methods to sum the resultant vector
            // on SSE3 vs SSE1
            else if (Sse3.IsSupported)
            {
                VectorF mul = Sse.Multiply(vector, vector);

                // Horizontal add of (X, Y, Z, W) with itself gives (X+Y, Z+W, X+Y, Z+W).
                // Z and W need no masking beforehand: they only reach lanes 1 and 3,
                // which the duplicate below overwrites.
                VectorF sums = Sse3.HorizontalAdd(mul, mul);

                // MoveLowAndDuplicate maps (A, B, C, D) to (A, A, C, C), so every lane becomes X+Y
                VectorF dp = Sse3.MoveLowAndDuplicate(sums);
                return(Sse.Divide(vector, Sse.Sqrt(dp)));
            }
            else if (Sse.IsSupported)
            {
                VectorF mul = Sse.Multiply(vector, vector);

                // Broadcast Y, add it to the lowest lane only (giving X+Y there), then broadcast that lane
                VectorF temp = Sse.Shuffle(mul, mul, Shuffle(1, 1, 1, 1));

                mul = Sse.AddScalar(mul, temp);

                mul = Sse.Shuffle(mul, mul, Shuffle(0, 0, 0, 0));

                return(Sse.Divide(vector, Sse.Sqrt(mul)));
            }
            #endregion

            return(Normalize2D_Software(vector));
        }