/// <summary>
/// Finalizes four parallel state rows: adds the original input words back into the working
/// state (x += o), transposes the four 4x32-bit rows so each output vector holds one row's
/// words in column order, then XORs each 16-byte result over <paramref name="source"/> and
/// stores it to <paramref name="destination"/>.
/// NOTE(review): the 64-byte stride between the four stores suggests four interleaved
/// 64-byte blocks (e.g. a 4-way ChaCha/Salsa keystream layout) — confirm against the caller.
/// </summary>
public static unsafe void AddTransposeXor(ref Vector128<uint> x0, ref Vector128<uint> x1, ref Vector128<uint> x2, ref Vector128<uint> x3, ref Vector128<uint> o0, ref Vector128<uint> o1, ref Vector128<uint> o2, ref Vector128<uint> o3, byte* source, byte* destination)
{
    // x += o
    x0 = Sse2.Add(x0, o0);
    x1 = Sse2.Add(x1, o1);
    x2 = Sse2.Add(x2, o2);
    x3 = Sse2.Add(x3, o3);

    // Transpose: classic 4x4 transpose — interleave 32-bit lanes, then 64-bit lanes.
    var t0 = Sse2.UnpackLow(x0, x1);
    var t1 = Sse2.UnpackLow(x2, x3);
    var t2 = Sse2.UnpackHigh(x0, x1);
    var t3 = Sse2.UnpackHigh(x2, x3);
    x0 = Sse2.UnpackLow(t0.AsUInt64(), t1.AsUInt64()).AsUInt32();
    x1 = Sse2.UnpackHigh(t0.AsUInt64(), t1.AsUInt64()).AsUInt32();
    x2 = Sse2.UnpackLow(t2.AsUInt64(), t3.AsUInt64()).AsUInt32();
    x3 = Sse2.UnpackHigh(t2.AsUInt64(), t3.AsUInt64()).AsUInt32();

    // Xor: one 16-byte lane written per 64-byte stride.
    Sse2.Store(destination, Sse2.Xor(x0.AsByte(), Sse2.LoadVector128(source)));
    Sse2.Store(destination + 64, Sse2.Xor(x1.AsByte(), Sse2.LoadVector128(source + 64)));
    Sse2.Store(destination + 128, Sse2.Xor(x2.AsByte(), Sse2.LoadVector128(source + 128)));
    Sse2.Store(destination + 192, Sse2.Xor(x3.AsByte(), Sse2.LoadVector128(source + 192)));
}
/// <summary>
/// Same add / 4x4 transpose / xor-and-store sequence as <c>AddTransposeXor</c>, written with
/// explicit temporaries: adds the original words into the working state, transposes the four
/// 32-bit rows, then XORs each resulting 16-byte lane over <paramref name="m"/> at 64-byte
/// strides and stores to <paramref name="c"/>.
/// </summary>
private static unsafe void OneQuad(ref Vector128<uint> x_A, ref Vector128<uint> x_B, ref Vector128<uint> x_C, ref Vector128<uint> x_D, ref Vector128<uint> origA, ref Vector128<uint> origB, ref Vector128<uint> origC, ref Vector128<uint> origD, byte* m, byte* c)
{
    Vector128<uint> t_A, t_B, t_C, t_D, t0, t1, t2, t3;

    // Feed the original input back into the working state.
    x_A = Sse2.Add(x_A, origA);
    x_B = Sse2.Add(x_B, origB);
    x_C = Sse2.Add(x_C, origC);
    x_D = Sse2.Add(x_D, origD);

    // 4x4 transpose: 32-bit interleave followed by 64-bit interleave.
    t_A = Sse2.UnpackLow(x_A, x_B);
    t_B = Sse2.UnpackLow(x_C, x_D);
    t_C = Sse2.UnpackHigh(x_A, x_B);
    t_D = Sse2.UnpackHigh(x_C, x_D);
    x_A = Sse2.UnpackLow(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32();
    x_B = Sse2.UnpackHigh(t_A.AsUInt64(), t_B.AsUInt64()).AsUInt32();
    x_C = Sse2.UnpackLow(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32();
    x_D = Sse2.UnpackHigh(t_C.AsUInt64(), t_D.AsUInt64()).AsUInt32();

    // XOR the keystream over the message, one 16-byte lane per 64-byte stride.
    t0 = Sse2.Xor(x_A.AsByte(), Sse2.LoadVector128(m)).AsUInt32();
    Sse2.Store(c, t0.AsByte());
    t1 = Sse2.Xor(x_B.AsByte(), Sse2.LoadVector128(m + 64)).AsUInt32();
    Sse2.Store(c + 64, t1.AsByte());
    t2 = Sse2.Xor(x_C.AsByte(), Sse2.LoadVector128(m + 128)).AsUInt32();
    Sse2.Store(c + 128, t2.AsByte());
    t3 = Sse2.Xor(x_D.AsByte(), Sse2.LoadVector128(m + 192)).AsUInt32();
    Sse2.Store(c + 192, t3.AsByte());
}
/// <summary>
/// Bitwise AND of two 128-bit vectors, dispatched to the best available x86 opcode for the
/// element type; falls back to a scalar software implementation when no SSE is present.
/// </summary>
public static Vector128<T> And<T>(Vector128<T> left, Vector128<T> right) where T : struct
{
    // float lanes: the single-precision ANDPS opcode only needs SSE1.
    if (typeof(T) == typeof(float) && Sse.IsSupported)
    {
        return Sse.And(left.AsSingle(), right.AsSingle()).As<float, T>();
    }

    // double lanes: prefer SSE2's ANDPD, otherwise reuse the float opcode (bit-identical result).
    if (typeof(T) == typeof(double))
    {
        if (Sse2.IsSupported)
        {
            return Sse2.And(left.AsDouble(), right.AsDouble()).As<double, T>();
        }
        if (Sse.IsSupported)
        {
            return Sse.And(left.AsSingle(), right.AsSingle()).As<float, T>();
        }
    }

    // Integer lanes: AND is type-agnostic, so reinterpret as bytes for PAND.
    if (Sse2.IsSupported)
    {
        return Sse2.And(left.AsByte(), right.AsByte()).As<byte, T>();
    }
    if (Sse.IsSupported)
    {
        return Sse.And(left.AsSingle(), right.AsSingle()).As<float, T>();
    }

    // No vector hardware available at all.
    return SoftwareFallbacks.And_Software(left, right);
}
/// <summary>
/// Returns the index of the first character in <paramref name="text"/> that requires
/// JavaScript escaping, or -1 when all <paramref name="textLength"/> characters are allowed.
/// Uses an SSE2 fast path (8 UTF-16 chars per iteration) on .NET Core, falling back to a
/// scalar per-character scan for the tail and on other targets.
/// </summary>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="text"/> is null.</exception>
public override unsafe int FindFirstCharacterToEncode(char* text, int textLength)
{
    if (text == null)
    {
        throw new ArgumentNullException(nameof(text));
    }
    int idx = 0;
#if NETCOREAPP
    if (Sse2.IsSupported)
    {
        short* startingAddress = (short*)text;
        while (textLength - 8 >= idx)
        {
            Debug.Assert(startingAddress >= text && startingAddress <= (text + textLength - 8));

            // Load the next 8 characters.
            Vector128<short> sourceValue = Sse2.LoadVector128(startingAddress);

            // Check if any of the 8 characters need to be escaped.
            Vector128<short> mask = Sse2Helper.CreateEscapingMask_DefaultJavaScriptEncoderBasicLatin(sourceValue);

            int index = Sse2.MoveMask(mask.AsByte());
            // If index == 0, that means none of the 8 characters needed to be escaped.
            // TrailingZeroCount is relatively expensive, avoid it if possible.
            if (index != 0)
            {
                // Found at least one character that needs to be escaped, figure out the index of
                // the first one found that needed to be escaped within the 8 characters.
                Debug.Assert(index > 0 && index <= 65_535);
                int tzc = BitOperations.TrailingZeroCount(index);
                // MoveMask yields one bit per *byte*, so each UTF-16 char covers 2 bits.
                Debug.Assert(tzc % 2 == 0 && tzc >= 0 && tzc <= 16);
                idx += tzc >> 1;
                goto Return;
            }
            idx += 8;
            startingAddress += 8;
        }

        // Process the remaining characters.
        Debug.Assert(textLength - idx < 8);
    }
#endif
    // Scalar tail (and the whole scan when SSE2 is unavailable).
    for (; idx < textLength; idx++)
    {
        Debug.Assert((text + idx) <= (text + textLength));
        if (NeedsEscaping(*(text + idx)))
        {
            goto Return;
        }
    }

    idx = -1; // All characters are allowed.

Return:
    return (idx);
}
/// <summary>
/// Compares two 128-bit vectors for exact byte-wise equality using SSE2.
/// </summary>
public static bool MyEquals(ref Vector128<int> left, Vector128<int> right)
{
    if (!Sse2.IsSupported)
    {
        // NOTE(review): without SSE2 this reports equality unconditionally — confirm callers
        // only reach this path on SSE2-capable hardware.
        return true;
    }

    // MoveMask collects the top bit of every byte: all 16 bits set means every byte matched.
    Vector128<byte> comparison = MyCompareEqual(left.AsByte(), right.AsByte());
    return Sse2.MoveMask(comparison) == 0xFFFF;
}
// Produces two complementary PSHUFB views of `initial`, controlled by a 16-byte window of the
// _shuffleMasks table starting at offset 16 - n.
// NOTE(review): the exact byte selection depends on the contents of _shuffleMasks (presumably
// a sliding identity window where out-of-range entries have the high bit set) — confirm
// against the table's initializer before relying on the left/right naming.
static void ShiftRight128(Vector128<ulong> initial, uint n, out Vector128<ulong> outLeft, out Vector128<ulong> outRight)
{
    uint maskPos = 16 - n;
    // Load the 16 shuffle-control bytes for this shift amount.
    Vector128<byte> maskA = Vector128.Create(_shuffleMasks[maskPos], _shuffleMasks[maskPos + 1], _shuffleMasks[maskPos + 2], _shuffleMasks[maskPos + 3], _shuffleMasks[maskPos + 4], _shuffleMasks[maskPos + 5], _shuffleMasks[maskPos + 6], _shuffleMasks[maskPos + 7], _shuffleMasks[maskPos + 8], _shuffleMasks[maskPos + 9], _shuffleMasks[maskPos + 10], _shuffleMasks[maskPos + 11], _shuffleMasks[maskPos + 12], _shuffleMasks[maskPos + 13], _shuffleMasks[maskPos + 14], _shuffleMasks[maskPos + 15]);
    // CompareEqual(zero, zero) is all-ones, so maskB = ~maskA. PSHUFB zeroes any byte whose
    // control has the top bit set, making the two shuffles select complementary bytes.
    Vector128<byte> maskB = Sse2.Xor(maskA, Sse2.CompareEqual(Vector128<byte>.Zero, Vector128<byte>.Zero));
    outLeft = Ssse3.Shuffle(initial.AsByte(), maskB).AsUInt64();
    outRight = Ssse3.Shuffle(initial.AsByte(), maskA).AsUInt64();
}
/// <summary>
/// Computes a multiplicative rolling hash of <paramref name="len"/> bytes at
/// <paramref name="buf"/>: sum of buf[i] * kMultFactors[len - i - 1], reduced by
/// (kBase - 1) — i.e. modulo kBase, which must therefore be a power of two.
/// Vectorized 4 bytes per iteration; SSE4.1 uses PMULLD directly, the SSE2 path emulates a
/// 32-bit multiply with two PMULUDQs.
/// NOTE(review): the shuffle constants SO123/SOO2O/S23O1/S1O32 are defined elsewhere in this
/// type — the lane orderings below assume their conventional meanings; verify against their
/// declarations.
/// </summary>
private unsafe ulong HashSse(byte* buf, int len)
{
    ulong h = 0;
    Vector128<int> v_ps = Vector128<int>.Zero;
    bool useSse4 = Sse41.IsSupported;
    int i = 0;
    // Main loop: j indexes the multiplier table from the tail end (len - i - 1).
    for (int j = len - i - 1; len - i >= 4; i += 4, j = len - i - 1)
    {
        // Load 4 multipliers (descending in memory) and reverse them into ascending order.
        Vector128<int> c_v = Sse2.LoadVector128(&kMultFactorsPtr[j - 3]);
        c_v = Sse2.Shuffle(c_v, SO123);
        Vector128<byte> q_v = Sse2.LoadVector128(buf + i);
        Vector128<int> s_v;
        if (useSse4)
        {
            // PMOVZXBD: widen the low 4 bytes straight to 32-bit lanes.
            s_v = Sse41.ConvertToVector128Int32(q_v);
        }
        else
        {
            // SSE2 widening: duplicate bytes, widen to 32 bits, then shift out the copies.
            q_v = Sse2.UnpackLow(q_v, q_v);
            s_v = Sse2.ShiftRightLogical(Sse2.UnpackLow(q_v.AsUInt16(), q_v.AsUInt16()).AsInt32(), 24);
        }
        if (useSse4)
        {
            v_ps = Sse2.Add(v_ps, Sse41.MultiplyLow(c_v, s_v));
        }
        else
        {
            // Emulate 32x32->32 multiply: PMULUDQ on even lanes, then on odd lanes (shifted
            // down by 4 bytes), and re-interleave the low halves of the products.
            Vector128<ulong> v_tmp1 = Sse2.Multiply(c_v.AsUInt32(), s_v.AsUInt32());
            Vector128<ulong> v_tmp2 = Sse2.Multiply(Sse2.ShiftRightLogical128BitLane(c_v.AsByte(), 4).AsUInt32(), Sse2.ShiftRightLogical128BitLane(s_v.AsByte(), 4).AsUInt32());
            ; // (stray empty statement in original — harmless)
            v_ps = Sse2.Add(v_ps, Sse2.UnpackLow(Sse2.Shuffle(v_tmp1.AsInt32(), SOO2O), Sse2.Shuffle(v_tmp2.AsInt32(), SOO2O)));
        }
    }
    // Horizontal sum of the 4 accumulator lanes.
    v_ps = Sse2.Add(v_ps, Sse2.Shuffle(v_ps, S23O1));
    v_ps = Sse2.Add(v_ps, Sse2.Shuffle(v_ps, S1O32));
    h += Sse2.ConvertToUInt32(v_ps.AsUInt32());
    // Scalar tail for the last 0-3 bytes.
    for (; i < len; i++)
    {
        int index = len - i - 1;
        ulong c = (uint)kMultFactors[index];
        h += c * buf[i];
    }
    // kBase is assumed to be a power of two so the AND is a modulo reduction.
    return (h & (kBase - 1));
}
/// <summary>
/// Parses eight ASCII digits at <paramref name="chars"/> into their numeric value using the
/// SIMD multiply-add cascade (as popularized by simdjson): subtract '0', then collapse
/// adjacent digits into pairs (x10), 4-digit groups (x100), and the final 8-digit value
/// (x10000). Assumes the input really is 8 valid digits (reads 16 bytes).
/// NOTE(review): mul_1_10 / mul_1_100 / mul_1_10000 are weight constants declared elsewhere
/// in this type — presumed to be the standard {10,1,...}, {100,1,...}, {10000,1,...} vectors.
/// </summary>
private static uint32_t parse_eight_digits_unrolled(bytechar* chars)
{
    // this actually computes *16* values so we are being wasteful.
    Vector128<sbyte> ascii0 = Vector128.Create((bytechar)'0');
    // ASCII digits -> numeric values 0..9 per byte.
    Vector128<sbyte> input = Sse2.Subtract(Sse2.LoadVector128(chars), ascii0);
    // PMADDUBSW: pairs of digits -> two-digit numbers (d0*10 + d1).
    Vector128<short> t1 = Ssse3.MultiplyAddAdjacent(input.AsByte(), mul_1_10);
    // PMADDWD: pairs of two-digit numbers -> four-digit numbers.
    Vector128<int> t2 = Sse2.MultiplyAddAdjacent(t1, mul_1_100);
    // Narrow back to 16-bit lanes so a final PMADDWD can combine the two 4-digit halves.
    Vector128<ushort> t3 = Sse41.PackUnsignedSaturate(t2, t2);
    Vector128<int> t4 = Sse2.MultiplyAddAdjacent(t3.AsInt16(), mul_1_10000);
    return (Sse2.ConvertToUInt32(t4.AsUInt32())); // only captures the sum of the first 8 digits, drop the rest
}
/// <summary>
/// Element-wise addition of two 128-bit vectors, dispatched on the element type.
/// Integer lanes wrap on overflow (PADDB/W/D/Q semantics); float/double use ADDPS/ADDPD.
/// </summary>
/// <exception cref="NotSupportedException">Thrown for element types without a mapping.</exception>
public static Vector128<T> Vector128Add<T>(Vector128<T> left, Vector128<T> right) where T : struct
{
    // The typeof(T) checks are JIT-time constants, so exactly one branch survives codegen.
    if (typeof(T) == typeof(byte))
    {
        return Sse2.Add(left.AsByte(), right.AsByte()).As<byte, T>();
    }
    if (typeof(T) == typeof(sbyte))
    {
        return Sse2.Add(left.AsSByte(), right.AsSByte()).As<sbyte, T>();
    }
    if (typeof(T) == typeof(short))
    {
        return Sse2.Add(left.AsInt16(), right.AsInt16()).As<short, T>();
    }
    if (typeof(T) == typeof(ushort))
    {
        return Sse2.Add(left.AsUInt16(), right.AsUInt16()).As<ushort, T>();
    }
    if (typeof(T) == typeof(int))
    {
        return Sse2.Add(left.AsInt32(), right.AsInt32()).As<int, T>();
    }
    if (typeof(T) == typeof(uint))
    {
        return Sse2.Add(left.AsUInt32(), right.AsUInt32()).As<uint, T>();
    }
    if (typeof(T) == typeof(long))
    {
        return Sse2.Add(left.AsInt64(), right.AsInt64()).As<long, T>();
    }
    if (typeof(T) == typeof(ulong))
    {
        return Sse2.Add(left.AsUInt64(), right.AsUInt64()).As<ulong, T>();
    }
    if (typeof(T) == typeof(float))
    {
        return Sse.Add(left.AsSingle(), right.AsSingle()).As<float, T>();
    }
    if (typeof(T) == typeof(double))
    {
        return Sse2.Add(left.AsDouble(), right.AsDouble()).As<double, T>();
    }
    throw new NotSupportedException();
}
/// <summary>
/// Bitwise select: each result bit comes from <paramref name="ifTrue"/> where the
/// corresponding bit of <paramref name="selector"/> is 1, and from
/// <paramref name="ifFalse"/> where it is 0.
/// </summary>
public static Vector128<float> ConditionalSelectBitwise(Vector128<float> selector, Vector128<float> ifTrue, Vector128<float> ifFalse)
{
    Debug.Assert(Sse.IsSupported || AdvSimd.IsSupported);
    if (Sse.IsSupported)
    {
        // (ifTrue & selector) | (ifFalse & ~selector)
        return (Sse.Or(
                    Sse.And(ifTrue, selector),
                    Sse.AndNot(selector, ifFalse)
                    ));
    }
    else if (AdvSimd.IsSupported)
    {
        // ARM BSL performs the whole select in one instruction.
        return (AdvSimd.BitwiseSelect(selector.AsByte(), ifTrue.AsByte(), ifFalse.AsByte()).As<byte, float>());
    }

    // Unreachable given the assert above; present to satisfy the compiler.
    return (default);
} // NOTE(review): closing brace restored — it appears to have been lost when this snippet was extracted.
/// <summary>
/// Lane-wise blend: picks <paramref name="right"/> where <paramref name="selector"/> is set
/// and <paramref name="left"/> otherwise. Uses BLENDV on SSE4.1 hardware and the classic
/// and/andnot/or emulation elsewhere.
/// </summary>
public static Vector128<T> Select<T, U>(Vector128<T> left, Vector128<T> right, Vector128<U> selector) where T : struct where U : struct
{
    if (!Sse41.IsSupported)
    {
        // Emulate blendv: (selector & right) | (~selector & left).
        return Or(And(selector.As<U, T>(), right), AndNot(selector.As<U, T>(), left));
    }

    if (typeof(T) == typeof(float))
    {
        return Sse41.BlendVariable(left.AsSingle(), right.AsSingle(), selector.AsSingle()).As<float, T>();
    }
    if (typeof(T) == typeof(double))
    {
        return Sse41.BlendVariable(left.AsDouble(), right.AsDouble(), selector.AsDouble()).As<double, T>();
    }
    // Integer lanes: byte-granularity PBLENDVB handles every width.
    return Sse41.BlendVariable(left.AsByte(), right.AsByte(), selector.AsByte()).As<byte, T>();
}
/// <summary>
/// Byte-swaps every 32-bit lane of <paramref name="value"/> via a PSHUFB with the
/// <c>Reverse32</c> control mask.
/// </summary>
public static Vector128<T> ReverseEndianness32<T>(this Vector128<T> value) where T : struct =>
    Ssse3.Shuffle(value.AsByte(), Reverse32).As<byte, T>();
/*public void ResizeBilinear2(FastBitmap rtnImage) * { * * float scaleX = (float)this.width / rtnImage.width; * float scaleY = (float)this.height / rtnImage.height; * if (scaleX > 1 || scaleY > 1) * { * ResizeBilinear(rtnImage); * return; * } * * byte[] tmp = new byte[4 * (this.height + 1) * (rtnImage.width)]; * * fixed (byte* p = tmp) * { * byte* tmpp = p; * * Parallel.For(0, this.height, (y) => * { * var _00mask = Vector128.Create(0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255); * var _01mask = Vector128.Create(4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255); * var _vmask = Vector128.Create(0, 4, 8, 12, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255); * * uint* store = stackalloc uint[4]; * * uint* pos = (uint*)(this._ptr + (this._stride * y)); * uint* rtnPos = (uint*)(tmpp + (rtnImage._stride * y)); * for (int x = 0; x < rtnImage.width; x++) * { * float px = scaleX * x; * int x0 = (int)px; * int x1 = x0 + 1; * float rx = px - x0; * * var rxv = Vector128.Create(rx, rx, rx, rx); * * var _ = Avx2.GatherVector128(pos, Vector128.Create(x0, x1, 0, 0), 4); * var _b = Vector128.AsByte(_); * var _00 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(_b, _00mask).AsInt32()); * var _01 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(_b, _01mask).AsInt32()); * var vf = Sse.Add(_00, Sse.Multiply(Sse.Subtract(_01, _00), rxv)); * var vb = Sse2.ConvertToVector128Int32WithTruncation(vf).AsByte(); * var v = Ssse3.Shuffle(vb, _vmask).AsUInt32(); * Sse2.Store(store, v); * rtnPos = *store; * rtnPos++; * } * }); * Parallel.For(0, rtnImage.height, (y) => * // for (int y = 0; y < rtnImage.height; y++) * * { * float py = scaleY * y; * int y0 = (int)py; * int y1 = y0 + 1; * * float ry = py - y0; * * uint* pos = (uint*)(tmpp + rtnImage.width * 4 * y0); * int offset = rtnImage.width; * var ryv = Vector128.Create(ry, ry, ry, ry); * * var _00mask = Vector128.Create(0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255); * 
var _01mask = Vector128.Create(4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255); * var _vmask = Vector128.Create(0, 4, 8, 12, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255); * uint* rtnPos = (uint*)(rtnImage._ptr + (rtnImage._stride * y)); * uint* store = stackalloc uint[4]; * * for (int x = 0; x < rtnImage.width; x++) * { * * var _ = Avx2.GatherVector128(pos, Vector128.Create(0, offset, 0, 0), 4); * var _b = Vector128.AsByte(_); * var _00 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(_b, _00mask).AsInt32()); * var _01 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(_b, _01mask).AsInt32()); * var vf = Sse.Add(_00, Sse.Multiply(Sse.Subtract(_01, _00), ryv)); * var vb = Sse2.ConvertToVector128Int32WithTruncation(vf).AsByte(); * var v = Ssse3.Shuffle(vb, _vmask).AsUInt32(); * Sse2.Store(store, v); * rtnPos = *store; * rtnPos++; * pos++; * // *rtnPos = *pos; * // rtnPos++; * // pos++; * // byte* _00 = tmpp + (rtnImage.width * 4 * y0) + x * 4; * // byte* _10 = tmpp + (rtnImage.width * 4 * y1) + x * 4; * // * // uint value = 0; * // ((byte*)(&value))[0] = (byte)(_00[0] + (_10[0] - _00[0]) * ry); * // ((byte*)(&value))[1] = (byte)(_00[1] + (_10[1] - _00[1]) * ry); * // ((byte*)(&value))[2] = (byte)(_00[2] + (_10[2] - _00[2]) * ry); * // ((byte*)(&value))[3] = (byte)(_00[3] + (_10[3] - _00[3]) * ry); * // * // *(uint*)(rtnImage._ptr + (rtnImage._stride * y) + (x * 4)) = value; * * } * }); * } * }*/ //int y0 = (int)py; //int y1 = y0 + 1; // //int x0 = (int)px; //int x1 = x0 + 1; // //float ry = py - y0; //float rx = px - x0; // //byte* _00 = this._ptr + (this._stride * y0) + (x0 * 4); //byte* _01 = this._ptr + (this._stride * y0) + (x1 * 4); //byte* _10 = this._ptr + (this._stride * y1) + (x0 * 4); //byte* _11 = this._ptr + (this._stride * y1) + (x1 * 4); // // //uint _y0u = 0; //uint _y1u = 0; //byte* _y0 = (byte*)&_y0u; //byte* _y1 = (byte*)&_y1u; // //_y0[0] = (byte)(_00[0] + (_10[0] - _00[0]) * ry); //_y0[1] = (byte)(_00[1] + (_10[1] - 
// (Remainder of the commented-out scalar bilinear reference implementation elided — the SIMD
//  version below supersedes it; recover the original from version control if needed.)

/// <summary>
/// Bilinearly resizes this bitmap into <paramref name="rtnImage"/>, one output row per
/// parallel task. For every destination pixel the four neighbouring source texels are
/// gathered in one AVX2 VPGATHERDD, split into four float vectors via PSHUFB+CVTDQ2PS,
/// lerped vertically then horizontally, and packed back to 4 bytes.
/// NOTE(review): assumes 32-bit (4-byte) pixels and that this bitmap is at least as large as
/// the scale factors imply (x1/y1 may read one texel beyond the last row/column for
/// edge pixels) — confirm callers guarantee this.
/// </summary>
public void ResizeBilinear(FastBitmap rtnImage)
{
    float scaleX = (float)this.width / rtnImage.width;
    float scaleY = (float)this.height / rtnImage.height;

    // Shuffle controls: expand gathered texels 0..3 into one 32-bit lane per channel
    // (control bytes >= 0x80 produce zeros, giving a clean zero-extension).
    var _00mask = Vector128.Create(0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255);
    var _01mask = Vector128.Create(4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255);
    var _10mask = Vector128.Create(8, 255, 255, 255, 9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255);
    var _11mask = Vector128.Create(12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255);
    // Re-packs the four truncated channel ints back into 4 contiguous bytes.
    var _vmask = Vector128.Create(0, 4, 8, 12, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255);

    Parallel.For(0, rtnImage.height, (y) =>
    {
        float py = scaleY * y;
        int y0 = (int)py;
        int y1 = y0 + 1;
        float ry = py - y0; // vertical lerp weight
        var ryv = Vector128.Create(ry, ry, ry, ry);
        uint* py0 = (uint*)(this._ptr + (this._stride * y0));
        uint* py1 = (uint*)(this._ptr + (this._stride * y1));
        // Element distance between the two source rows, used as a gather-index offset.
        int dy = (int)(py1 - py0);
        uint* rtnPos = (uint*)(rtnImage._ptr + (rtnImage._stride * y));
        uint* store = stackalloc uint[4];
        for (int x = 0; x < rtnImage.width; x++)
        {
            float px = scaleX * x;
            int x0 = (int)px;
            int x1 = x0 + 1;
            float rx = px - x0; // horizontal lerp weight
            var rxv = Vector128.Create(rx, rx, rx, rx);
            // Gather the 2x2 texel neighbourhood in one instruction.
            var _ = Avx2.GatherVector128(py0, Vector128.Create(x0, x1, x0 + dy, x1 + dy), 4);
            var _b = Vector128.AsByte(_);
            // Widen each texel's 4 channel bytes to floats.
            var _00 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(_b, _00mask).AsInt32());
            var _01 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(_b, _01mask).AsInt32());
            var _10 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(_b, _10mask).AsInt32());
            var _11 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(_b, _11mask).AsInt32());
            // Vertical lerp, then horizontal lerp.
            var _y0 = Sse.Add(_00, Sse.Multiply(Sse.Subtract(_10, _00), ryv));
            var _y1 = Sse.Add(_01, Sse.Multiply(Sse.Subtract(_11, _01), ryv));
            var vf = Sse.Add(_y0, Sse.Multiply(Sse.Subtract(_y1, _y0), rxv));
            // Truncate to ints and pack the low byte of each lane into one pixel.
            var vb = Sse2.ConvertToVector128Int32WithTruncation(vf).AsByte();
            var v = Ssse3.Shuffle(vb, _vmask).AsUInt32();
            Sse2.Store(store, v);
            *rtnPos = *store;
            rtnPos++;
            // NOTE(review): these two assignments are dead — both variables are recomputed at
            // the top of the next iteration; presumably a leftover from a caching attempt.
            _00 = _10;
            _01 = _11;
        }
    });
}
/// <summary>
/// Rotates every 32-bit lane of <paramref name="value"/> left by 24 bits. On SSSE3 hardware
/// a single PSHUFB with the <c>Rot24</c> control does the whole rotate; otherwise the
/// shift-based fallback is used.
/// </summary>
public static Vector128<T> RotateLeftUInt32_24<T>(this Vector128<T> value) where T : struct
{
    if (Ssse3.IsSupported)
    {
        return Ssse3.Shuffle(value.AsByte(), Rot24).As<byte, T>();
    }
    return value.RotateLeftUInt32(24);
}
/// <summary>
/// Converts an NV12 (planar Y + interleaved UV, 4:2:0) input surface into an interleaved
/// <see cref="Surface"/> whose channels carry Y/U/V in R/G/B with alpha forced to 0x3ff
/// (10-bit lanes — values are left-shifted by 2 / upsampled). An SSE4.1 fast path processes
/// 16 pixels per iteration; a scalar path covers the tail and non-SSE4.1 hardware.
/// </summary>
private unsafe static Surface ReadNv12(ResourceManager rm, ref SlotSurfaceConfig config, ref PlaneOffsets offsets)
{
    InputSurface input = ReadSurface(rm.Gmm, ref config, ref offsets, 1, 2);

    int width = input.Width;
    int height = input.Height;

    int yStride = GetPitch(width, 1);
    int uvStride = GetPitch(input.UvWidth, 2);

    Surface output = new Surface(rm.SurfacePool, width, height);

    if (Sse41.IsSupported)
    {
        // Reorders each 4-byte group from (Y,U,V,Y') interleave order to (Y,V,Y',U)-style
        // channel order expected by the widening below.
        Vector128<byte> shufMask = Vector128.Create(
            (byte)0, (byte)2, (byte)3, (byte)1,
            (byte)4, (byte)6, (byte)7, (byte)5,
            (byte)8, (byte)10, (byte)11, (byte)9,
            (byte)12, (byte)14, (byte)15, (byte)13);
        // Sets the 16-bit alpha lane (top quarter of each 64-bit pixel) to 0xff before the
        // final shift-left-by-2 produces 0x3fc|0x3 -> 0x3ff-equivalent range.
        Vector128<short> alphaMask = Vector128.Create(0xffUL << 48).AsInt16();

        int yStrideGap = yStride - width;
        int uvStrideGap = uvStride - input.UvWidth;

        int widthTrunc = width & ~0xf; // widest multiple of 16 we can vectorize

        fixed (Pixel* dstPtr = output.Data)
        {
            Pixel* op = dstPtr;

            fixed (byte* src0Ptr = input.Buffer0, src1Ptr = input.Buffer1)
            {
                byte* i0p = src0Ptr;

                for (int y = 0; y < height; y++)
                {
                    // UV plane is half vertical resolution: one UV row per two Y rows.
                    byte* i1p = src1Ptr + (y >> 1) * uvStride;

                    int x = 0;

                    for (; x < widthTrunc; x += 16, i0p += 16, i1p += 16)
                    {
                        // Widen 16 luma bytes to two 8x16-bit vectors.
                        Vector128<short> ya0 = Sse41.ConvertToVector128Int16(i0p);
                        Vector128<short> ya1 = Sse41.ConvertToVector128Int16(i0p + 8);

                        // Duplicate each UV pair across two horizontal pixels (4:2:0 -> 4:4:4).
                        Vector128<byte> uv = Sse2.LoadVector128(i1p);
                        Vector128<short> uv0 = Sse2.UnpackLow(uv.AsInt16(), uv.AsInt16());
                        Vector128<short> uv1 = Sse2.UnpackHigh(uv.AsInt16(), uv.AsInt16());

                        // Interleave Y with UV, then shuffle into per-pixel channel order.
                        Vector128<short> rgba0 = Sse2.UnpackLow(ya0, uv0);
                        Vector128<short> rgba1 = Sse2.UnpackHigh(ya0, uv0);
                        Vector128<short> rgba2 = Sse2.UnpackLow(ya1, uv1);
                        Vector128<short> rgba3 = Sse2.UnpackHigh(ya1, uv1);

                        rgba0 = Ssse3.Shuffle(rgba0.AsByte(), shufMask).AsInt16();
                        rgba1 = Ssse3.Shuffle(rgba1.AsByte(), shufMask).AsInt16();
                        rgba2 = Ssse3.Shuffle(rgba2.AsByte(), shufMask).AsInt16();
                        rgba3 = Ssse3.Shuffle(rgba3.AsByte(), shufMask).AsInt16();

                        // Widen each pair of pixels to 16-bit channel lanes.
                        Vector128<short> rgba16_0 = Sse41.ConvertToVector128Int16(rgba0.AsByte());
                        Vector128<short> rgba16_1 = Sse41.ConvertToVector128Int16(HighToLow(rgba0.AsByte()));
                        Vector128<short> rgba16_2 = Sse41.ConvertToVector128Int16(rgba1.AsByte());
                        Vector128<short> rgba16_3 = Sse41.ConvertToVector128Int16(HighToLow(rgba1.AsByte()));
                        Vector128<short> rgba16_4 = Sse41.ConvertToVector128Int16(rgba2.AsByte());
                        Vector128<short> rgba16_5 = Sse41.ConvertToVector128Int16(HighToLow(rgba2.AsByte()));
                        Vector128<short> rgba16_6 = Sse41.ConvertToVector128Int16(rgba3.AsByte());
                        Vector128<short> rgba16_7 = Sse41.ConvertToVector128Int16(HighToLow(rgba3.AsByte()));

                        // Force the alpha lane on, then scale all 8-bit values to 10 bits.
                        rgba16_0 = Sse2.Or(rgba16_0, alphaMask);
                        rgba16_1 = Sse2.Or(rgba16_1, alphaMask);
                        rgba16_2 = Sse2.Or(rgba16_2, alphaMask);
                        rgba16_3 = Sse2.Or(rgba16_3, alphaMask);
                        rgba16_4 = Sse2.Or(rgba16_4, alphaMask);
                        rgba16_5 = Sse2.Or(rgba16_5, alphaMask);
                        rgba16_6 = Sse2.Or(rgba16_6, alphaMask);
                        rgba16_7 = Sse2.Or(rgba16_7, alphaMask);

                        rgba16_0 = Sse2.ShiftLeftLogical(rgba16_0, 2);
                        rgba16_1 = Sse2.ShiftLeftLogical(rgba16_1, 2);
                        rgba16_2 = Sse2.ShiftLeftLogical(rgba16_2, 2);
                        rgba16_3 = Sse2.ShiftLeftLogical(rgba16_3, 2);
                        rgba16_4 = Sse2.ShiftLeftLogical(rgba16_4, 2);
                        rgba16_5 = Sse2.ShiftLeftLogical(rgba16_5, 2);
                        rgba16_6 = Sse2.ShiftLeftLogical(rgba16_6, 2);
                        rgba16_7 = Sse2.ShiftLeftLogical(rgba16_7, 2);

                        // Each store covers 2 output pixels (8 shorts).
                        Sse2.Store((short*)(op + (uint)x + 0), rgba16_0);
                        Sse2.Store((short*)(op + (uint)x + 2), rgba16_1);
                        Sse2.Store((short*)(op + (uint)x + 4), rgba16_2);
                        Sse2.Store((short*)(op + (uint)x + 6), rgba16_3);
                        Sse2.Store((short*)(op + (uint)x + 8), rgba16_4);
                        Sse2.Store((short*)(op + (uint)x + 10), rgba16_5);
                        Sse2.Store((short*)(op + (uint)x + 12), rgba16_6);
                        Sse2.Store((short*)(op + (uint)x + 14), rgba16_7);
                    }

                    // Scalar tail: advance the UV pointer every second pixel.
                    for (; x < width; x++, i1p += (x & 1) * 2)
                    {
                        Pixel* px = op + (uint)x;

                        px->R = Upsample(*i0p++);
                        px->G = Upsample(*i1p);
                        px->B = Upsample(*(i1p + 1));
                        px->A = 0x3ff;
                    }

                    op += width;
                    i0p += yStrideGap;
                    i1p += uvStrideGap;
                }
            }
        }
    }
    else
    {
        // Scalar fallback: same channel mapping without intrinsics.
        for (int y = 0; y < height; y++)
        {
            int uvBase = (y >> 1) * uvStride;

            for (int x = 0; x < width; x++)
            {
                output.SetR(x, y, Upsample(input.Buffer0[y * yStride + x]));

                int uvOffs = uvBase + (x & ~1);

                output.SetG(x, y, Upsample(input.Buffer1[uvOffs]));
                output.SetB(x, y, Upsample(input.Buffer1[uvOffs + 1]));
                output.SetA(x, y, 0x3ff);
            }
        }
    }

    return (output);
}
/// <summary>
/// Premultiplies the alpha channel into the R/G/B channels of every pixel in
/// <paramref name="data"/>, four 32-bit pixels per SSE2 iteration; the final partial group
/// (when the length is not a multiple of 4) is handled by the scalar fallback.
/// The per-channel math is (channel * alpha + 0xFF) >> 8, with alpha itself left untouched.
/// </summary>
internal static unsafe void ProcessTextureSse2(Span<Color8> data)
{
    uint registerElements = (uint)Vector128<uint>.Count;
    // Sanity: 4 pixels (Color8) per 128-bit register.
    registerElements.AssertEqual((uint)(sizeof(Vector128<uint>) / sizeof(Color8)));

    uint offset;
    fixed (Color8* dataPtr8 = data)
    {
        uint* dataPtr = (uint*)dataPtr8;
        for (offset = 0; offset + (registerElements - 1U) < data.Length; offset += registerElements)
        {
            Vector128<uint> rawColor = Sse2.LoadVector128(dataPtr + offset);
            // Isolate the alpha bytes (high byte of each 32-bit pixel).
            Vector128<uint> alphaMask = Vector128.Create(0xFF000000U);
            Vector128<uint> alpha = Sse2.And(rawColor, alphaMask);
            // Widen the 16 channel bytes into two vectors of 8x16-bit lanes.
            Vector128<ushort> lo = Sse2.UnpackLow(rawColor.AsByte(), Vector128<byte>.Zero).AsUInt16();
            Vector128<ushort> hi = Sse2.UnpackHigh(rawColor.AsByte(), Vector128<byte>.Zero).AsUInt16();
            Vector128<uint> alphaLo, alphaHi;
            if (Ssse3.IsSupported)
            {
                // Broadcast each pixel's alpha (byte 6/14 of the widened halves) across its
                // four 16-bit channel lanes in a single PSHUFB.
                Vector128<byte> alphaShuffle = Vector128.Create(6, 0xFF, 6, 0xFF, 6, 0xFF, 6, 0xFF, 14, 0xFF, 14, 0xFF, 14, 0xFF, 14, 0xFF);
                alphaLo = Ssse3.Shuffle(lo.AsByte(), alphaShuffle).AsUInt32();
                alphaHi = Ssse3.Shuffle(hi.AsByte(), alphaShuffle).AsUInt32();
            }
            else
            {
                // SSE2-only broadcast: widen the masked alpha, then OR-smear it across the
                // 16-bit lanes (>>16) and across each 64-bit half (>>32).
                alphaLo = Sse2.UnpackLow(alpha.AsByte(), Vector128<byte>.Zero).AsUInt32();
                alphaHi = Sse2.UnpackHigh(alpha.AsByte(), Vector128<byte>.Zero).AsUInt32();
                Vector128<uint> alphaLo16 = Sse2.ShiftRightLogical(alphaLo, 16);
                Vector128<uint> alphaHi16 = Sse2.ShiftRightLogical(alphaHi, 16);
                alphaLo = Sse2.Or(alphaLo, alphaLo16);
                alphaHi = Sse2.Or(alphaHi, alphaHi16);
                Vector128<ulong> alphaLo32 = Sse2.ShiftRightLogical(alphaLo.AsUInt64(), 32);
                Vector128<ulong> alphaHi32 = Sse2.ShiftRightLogical(alphaHi.AsUInt64(), 32);
                alphaLo = Sse2.Or(alphaLo.AsUInt64(), alphaLo32).AsUInt32();
                alphaHi = Sse2.Or(alphaHi.AsUInt64(), alphaHi32).AsUInt32();
            }
            // channel * alpha, +0xFF for rounding, >>8 to renormalize.
            Vector128<ushort> prodLo = Sse2.MultiplyLow(lo, alphaLo.AsUInt16());
            Vector128<ushort> prodHi = Sse2.MultiplyLow(hi, alphaHi.AsUInt16());
            Vector128<ushort> addend = Vector128.Create((ushort)0x00FFU);
            var sumLo = Sse2.Add(prodLo, addend);
            var sumHi = Sse2.Add(prodHi, addend);
            var shiftLo = Sse2.ShiftRightLogical(sumLo, 8);
            var shiftHi = Sse2.ShiftRightLogical(sumHi, 8);
            // Narrow back to bytes, drop the computed alpha lane, restore the original alpha.
            var packed = Sse2.PackUnsignedSaturate(shiftLo.AsInt16(), shiftHi.AsInt16()).AsUInt32();
            var mask = Vector128.Create(0x00FFFFFFU);
            packed = Sse2.And(packed, mask);
            packed = Sse2.Or(packed, alpha);
            Sse2.Store(dataPtr + offset, packed);
        }
    }
    // This is unlikely to happen, but handle when there are still elements left (the texture size isn't aligned to 4)
    if (offset < data.Length)
    {
        ProcessTextureScalar(data.SliceUnsafe(offset));
    }
}
/// <summary>
/// SSSE3-accelerated Adler-32 step: folds <paramref name="len"/> bytes of
/// <paramref name="buf"/> into the running sums <paramref name="sum1"/> /
/// <paramref name="sum2"/>. Follows the classic vectorized adler32 scheme: 32-byte blocks,
/// PSADBW accumulates s1, PMADDUBSW against descending taps accumulates s2, with a modulo
/// reduction at every NMAX boundary; any tail shorter than a block is handled scalar.
/// </summary>
internal static void Step(ref ushort sum1, ref ushort sum2, byte[] buf, uint len)
{
    uint s1 = sum1;
    uint s2 = sum2;
    int bufPos = 0;

    /*
     * Process the data in blocks.
     */
    uint BLOCK_SIZE = 1 << 5;
    uint blocks = len / BLOCK_SIZE;
    len -= blocks * BLOCK_SIZE;

    while (blocks != 0)
    {
        uint n = Adler32Context.NMAX / BLOCK_SIZE; /* The NMAX constraint. */

        if (n > blocks)
        {
            n = blocks;
        }

        blocks -= n;

        // Descending weights 32..17 and 16..1 for the s2 multiply-add.
        Vector128<byte> tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17).AsByte();
        Vector128<byte> tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1).AsByte();
        Vector128<byte> zero = Vector128.Create(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0).AsByte();
        Vector128<short> ones = Vector128.Create(1, 1, 1, 1, 1, 1, 1, 1);

        /*
         * Process n blocks of data. At most NMAX data bytes can be
         * processed before s2 must be reduced modulo BASE.
         */
        // v_ps starts at s1 * n because the incoming s1 contributes once per block to s2.
        Vector128<uint> v_ps = Vector128.Create(s1 * n, 0, 0, 0);
        Vector128<uint> v_s2 = Vector128.Create(s2, 0, 0, 0);
        Vector128<uint> v_s1 = Vector128.Create(0u, 0, 0, 0);

        do
        {
            /*
             * Load 32 input bytes.
             */
            Vector128<uint> bytes1 = Vector128.Create(BitConverter.ToUInt32(buf, bufPos), BitConverter.ToUInt32(buf, bufPos + 4), BitConverter.ToUInt32(buf, bufPos + 8), BitConverter.ToUInt32(buf, bufPos + 12));
            bufPos += 16;
            Vector128<uint> bytes2 = Vector128.Create(BitConverter.ToUInt32(buf, bufPos), BitConverter.ToUInt32(buf, bufPos + 4), BitConverter.ToUInt32(buf, bufPos + 8), BitConverter.ToUInt32(buf, bufPos + 12));
            bufPos += 16;

            /*
             * Add previous block byte sum to v_ps.
             */
            v_ps = Sse2.Add(v_ps, v_s1);

            /*
             * Horizontally add the bytes for s1, multiply-adds the
             * bytes by [ 32, 31, 30, ... ] for s2.
             */
            v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1.AsByte(), zero).AsUInt32());
            Vector128<short> mad1 = System.Runtime.Intrinsics.X86.Ssse3.MultiplyAddAdjacent(bytes1.AsByte(), tap1.AsSByte());
            v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1.AsInt16(), ones.AsInt16()).AsUInt32());
            v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2.AsByte(), zero).AsUInt32());
            Vector128<short> mad2 = System.Runtime.Intrinsics.X86.Ssse3.MultiplyAddAdjacent(bytes2.AsByte(), tap2.AsSByte());
            v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2.AsInt16(), ones.AsInt16()).AsUInt32());
        } while (--n != 0);

        // s2 += 32 * (accumulated per-block s1 sums); the shift replaces the multiply.
        v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

        /*
         * Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
         */
        v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, 177));
        v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, 78));
        s1 += (uint)Sse2.ConvertToInt32(v_s1.AsInt32());
        v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, 177));
        v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, 78));
        // v_s2 was seeded with the incoming s2, so plain assignment (not +=) is correct here.
        s2 = (uint)Sse2.ConvertToInt32(v_s2.AsInt32());

        /*
         * Reduce.
         */
        s1 %= Adler32Context.ADLER_MODULE;
        s2 %= Adler32Context.ADLER_MODULE;
    }

    /*
     * Handle leftover data (always fewer than 32 bytes at this point).
     */
    if (len != 0)
    {
        if (len >= 16)
        {
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            len -= 16;
        }

        while (len-- != 0)
        {
            s2 += s1 += buf[bufPos++];
        }

        if (s1 >= Adler32Context.ADLER_MODULE)
        {
            s1 -= Adler32Context.ADLER_MODULE;
        }

        s2 %= Adler32Context.ADLER_MODULE;
    }

    /*
     * Return the recombined sums.
     */
    sum1 = (ushort)(s1 & 0xFFFF);
    sum2 = (ushort)(s2 & 0xFFFF);
}
/// <summary>
/// Byte-swaps every 32-bit lane of <paramref name="value"/> using a PSHUFB with the
/// <c>Reverse32</c> control mask, returning the result as raw bytes.
/// </summary>
public static Vector128<byte> ReverseEndianness32(this Vector128<uint> value) =>
    Ssse3.Shuffle(value.AsByte(), Reverse32);
/// <summary>
/// Premultiplies the alpha channel into the R/G/B channels of every pixel in
/// <paramref name="data"/>. Identical math to <c>ProcessTextureSse2</c>
/// ((channel * alpha + 0xFF) >> 8, alpha preserved) but unrolled 4x: four 128-bit registers,
/// i.e. 16 pixels, per iteration. Any tail shorter than 16 pixels is handled scalar.
/// </summary>
internal static unsafe void ProcessTextureSse2Unrolled(Span<Color8> data)
{
    // 4 registers x 4 pixels per register = 16 pixels per iteration.
    uint registerElements = (uint)Vector128<uint>.Count * 4U;
    // BUG FIX: the original asserted registerElements (16) against the *per-register*
    // element count (sizeof(Vector128<uint>) / sizeof(Color8) == 4); scale the expected
    // value by the unroll factor so the assertion can actually hold.
    registerElements.AssertEqual((uint)(sizeof(Vector128<uint>) / sizeof(Color8)) * 4U);

    uint offset;
    fixed (Color8* dataPtr8 = data)
    {
        uint* dataPtr = (uint*)dataPtr8;
        for (offset = 0; offset + (registerElements - 1U) < data.Length; offset += registerElements)
        {
            Vector128<uint> rawColor0 = Sse2.LoadVector128(dataPtr + offset + 0x0);
            Vector128<uint> rawColor1 = Sse2.LoadVector128(dataPtr + offset + 0x4);
            Vector128<uint> rawColor2 = Sse2.LoadVector128(dataPtr + offset + 0x8);
            Vector128<uint> rawColor3 = Sse2.LoadVector128(dataPtr + offset + 0xC);

            // Isolate the alpha bytes (high byte of each 32-bit pixel).
            Vector128<uint> alphaMask = Vector128.Create(0xFF000000U);
            Vector128<uint> alpha0 = Sse2.And(rawColor0, alphaMask);
            Vector128<uint> alpha1 = Sse2.And(rawColor1, alphaMask);
            Vector128<uint> alpha2 = Sse2.And(rawColor2, alphaMask);
            Vector128<uint> alpha3 = Sse2.And(rawColor3, alphaMask);

            // Widen the channel bytes to 16-bit lanes (low/high half of each register).
            Vector128<ushort> lo0 = Sse2.UnpackLow(rawColor0.AsByte(), Vector128<byte>.Zero).AsUInt16();
            Vector128<ushort> lo1 = Sse2.UnpackLow(rawColor1.AsByte(), Vector128<byte>.Zero).AsUInt16();
            Vector128<ushort> lo2 = Sse2.UnpackLow(rawColor2.AsByte(), Vector128<byte>.Zero).AsUInt16();
            Vector128<ushort> lo3 = Sse2.UnpackLow(rawColor3.AsByte(), Vector128<byte>.Zero).AsUInt16();
            Vector128<ushort> hi0 = Sse2.UnpackHigh(rawColor0.AsByte(), Vector128<byte>.Zero).AsUInt16();
            Vector128<ushort> hi1 = Sse2.UnpackHigh(rawColor1.AsByte(), Vector128<byte>.Zero).AsUInt16();
            Vector128<ushort> hi2 = Sse2.UnpackHigh(rawColor2.AsByte(), Vector128<byte>.Zero).AsUInt16();
            Vector128<ushort> hi3 = Sse2.UnpackHigh(rawColor3.AsByte(), Vector128<byte>.Zero).AsUInt16();

            Vector128<uint> alphaLo0, alphaHi0;
            Vector128<uint> alphaLo1, alphaHi1;
            Vector128<uint> alphaLo2, alphaHi2;
            Vector128<uint> alphaLo3, alphaHi3;
            if (Ssse3.IsSupported)
            {
                // Broadcast each pixel's alpha (byte 6/14 of the widened halves) across its
                // four 16-bit channel lanes in one PSHUFB per register half.
                Vector128<byte> alphaShuffle = Vector128.Create(6, 0xFF, 6, 0xFF, 6, 0xFF, 6, 0xFF, 14, 0xFF, 14, 0xFF, 14, 0xFF, 14, 0xFF);
                alphaLo0 = Ssse3.Shuffle(lo0.AsByte(), alphaShuffle).AsUInt32();
                alphaLo1 = Ssse3.Shuffle(lo1.AsByte(), alphaShuffle).AsUInt32();
                alphaLo2 = Ssse3.Shuffle(lo2.AsByte(), alphaShuffle).AsUInt32();
                alphaLo3 = Ssse3.Shuffle(lo3.AsByte(), alphaShuffle).AsUInt32();
                alphaHi0 = Ssse3.Shuffle(hi0.AsByte(), alphaShuffle).AsUInt32();
                alphaHi1 = Ssse3.Shuffle(hi1.AsByte(), alphaShuffle).AsUInt32();
                alphaHi2 = Ssse3.Shuffle(hi2.AsByte(), alphaShuffle).AsUInt32();
                alphaHi3 = Ssse3.Shuffle(hi3.AsByte(), alphaShuffle).AsUInt32();
            }
            else
            {
                // SSE2-only broadcast: widen the masked alpha, then OR-smear it across the
                // 16-bit lanes (>>16) and across each 64-bit half (>>32).
                alphaLo0 = Sse2.UnpackLow(alpha0.AsByte(), Vector128<byte>.Zero).AsUInt32();
                alphaLo1 = Sse2.UnpackLow(alpha1.AsByte(), Vector128<byte>.Zero).AsUInt32();
                alphaLo2 = Sse2.UnpackLow(alpha2.AsByte(), Vector128<byte>.Zero).AsUInt32();
                alphaLo3 = Sse2.UnpackLow(alpha3.AsByte(), Vector128<byte>.Zero).AsUInt32();
                alphaHi0 = Sse2.UnpackHigh(alpha0.AsByte(), Vector128<byte>.Zero).AsUInt32();
                alphaHi1 = Sse2.UnpackHigh(alpha1.AsByte(), Vector128<byte>.Zero).AsUInt32();
                alphaHi2 = Sse2.UnpackHigh(alpha2.AsByte(), Vector128<byte>.Zero).AsUInt32();
                alphaHi3 = Sse2.UnpackHigh(alpha3.AsByte(), Vector128<byte>.Zero).AsUInt32();
                Vector128<uint> alphaLo160 = Sse2.ShiftRightLogical(alphaLo0, 16);
                Vector128<uint> alphaLo161 = Sse2.ShiftRightLogical(alphaLo1, 16);
                Vector128<uint> alphaLo162 = Sse2.ShiftRightLogical(alphaLo2, 16);
                Vector128<uint> alphaLo163 = Sse2.ShiftRightLogical(alphaLo3, 16);
                Vector128<uint> alphaHi160 = Sse2.ShiftRightLogical(alphaHi0, 16);
                Vector128<uint> alphaHi161 = Sse2.ShiftRightLogical(alphaHi1, 16);
                Vector128<uint> alphaHi162 = Sse2.ShiftRightLogical(alphaHi2, 16);
                Vector128<uint> alphaHi163 = Sse2.ShiftRightLogical(alphaHi3, 16);
                alphaLo0 = Sse2.Or(alphaLo0, alphaLo160);
                alphaLo1 = Sse2.Or(alphaLo1, alphaLo161);
                alphaLo2 = Sse2.Or(alphaLo2, alphaLo162);
                alphaLo3 = Sse2.Or(alphaLo3, alphaLo163);
                alphaHi0 = Sse2.Or(alphaHi0, alphaHi160);
                alphaHi1 = Sse2.Or(alphaHi1, alphaHi161);
                alphaHi2 = Sse2.Or(alphaHi2, alphaHi162);
                alphaHi3 = Sse2.Or(alphaHi3, alphaHi163);
                Vector128<ulong> alphaLo320 = Sse2.ShiftRightLogical(alphaLo0.AsUInt64(), 32);
                Vector128<ulong> alphaLo321 = Sse2.ShiftRightLogical(alphaLo1.AsUInt64(), 32);
                Vector128<ulong> alphaLo322 = Sse2.ShiftRightLogical(alphaLo2.AsUInt64(), 32);
                Vector128<ulong> alphaLo323 = Sse2.ShiftRightLogical(alphaLo3.AsUInt64(), 32);
                Vector128<ulong> alphaHi320 = Sse2.ShiftRightLogical(alphaHi0.AsUInt64(), 32);
                Vector128<ulong> alphaHi321 = Sse2.ShiftRightLogical(alphaHi1.AsUInt64(), 32);
                Vector128<ulong> alphaHi322 = Sse2.ShiftRightLogical(alphaHi2.AsUInt64(), 32);
                Vector128<ulong> alphaHi323 = Sse2.ShiftRightLogical(alphaHi3.AsUInt64(), 32);
                alphaLo0 = Sse2.Or(alphaLo0.AsUInt64(), alphaLo320).AsUInt32();
                alphaLo1 = Sse2.Or(alphaLo1.AsUInt64(), alphaLo321).AsUInt32();
                alphaLo2 = Sse2.Or(alphaLo2.AsUInt64(), alphaLo322).AsUInt32();
                alphaLo3 = Sse2.Or(alphaLo3.AsUInt64(), alphaLo323).AsUInt32();
                alphaHi0 = Sse2.Or(alphaHi0.AsUInt64(), alphaHi320).AsUInt32();
                alphaHi1 = Sse2.Or(alphaHi1.AsUInt64(), alphaHi321).AsUInt32();
                alphaHi2 = Sse2.Or(alphaHi2.AsUInt64(), alphaHi322).AsUInt32();
                alphaHi3 = Sse2.Or(alphaHi3.AsUInt64(), alphaHi323).AsUInt32();
            }

            // channel * alpha, +0xFF for rounding, >>8 to renormalize.
            Vector128<ushort> prodLo0 = Sse2.MultiplyLow(lo0, alphaLo0.AsUInt16());
            Vector128<ushort> prodLo1 = Sse2.MultiplyLow(lo1, alphaLo1.AsUInt16());
            Vector128<ushort> prodLo2 = Sse2.MultiplyLow(lo2, alphaLo2.AsUInt16());
            Vector128<ushort> prodLo3 = Sse2.MultiplyLow(lo3, alphaLo3.AsUInt16());
            Vector128<ushort> prodHi0 = Sse2.MultiplyLow(hi0, alphaHi0.AsUInt16());
            Vector128<ushort> prodHi1 = Sse2.MultiplyLow(hi1, alphaHi1.AsUInt16());
            Vector128<ushort> prodHi2 = Sse2.MultiplyLow(hi2, alphaHi2.AsUInt16());
            Vector128<ushort> prodHi3 = Sse2.MultiplyLow(hi3, alphaHi3.AsUInt16());
            Vector128<ushort> addend = Vector128.Create((ushort)0x00FFU);
            var sumLo0 = Sse2.Add(prodLo0, addend);
            var sumLo1 = Sse2.Add(prodLo1, addend);
            var sumLo2 = Sse2.Add(prodLo2, addend);
            var sumLo3 = Sse2.Add(prodLo3, addend);
            var sumHi0 = Sse2.Add(prodHi0, addend);
            var sumHi1 = Sse2.Add(prodHi1, addend);
            var sumHi2 = Sse2.Add(prodHi2, addend);
            var sumHi3 = Sse2.Add(prodHi3, addend);
            var shiftLo0 = Sse2.ShiftRightLogical(sumLo0, 8);
            var shiftLo1 = Sse2.ShiftRightLogical(sumLo1, 8);
            var shiftLo2 = Sse2.ShiftRightLogical(sumLo2, 8);
            var shiftLo3 = Sse2.ShiftRightLogical(sumLo3, 8);
            var shiftHi0 = Sse2.ShiftRightLogical(sumHi0, 8);
            var shiftHi1 = Sse2.ShiftRightLogical(sumHi1, 8);
            var shiftHi2 = Sse2.ShiftRightLogical(sumHi2, 8);
            var shiftHi3 = Sse2.ShiftRightLogical(sumHi3, 8);

            // Narrow back to bytes, drop the computed alpha lane, restore the original alpha.
            var packed0 = Sse2.PackUnsignedSaturate(shiftLo0.AsInt16(), shiftHi0.AsInt16()).AsUInt32();
            var packed1 = Sse2.PackUnsignedSaturate(shiftLo1.AsInt16(), shiftHi1.AsInt16()).AsUInt32();
            var packed2 = Sse2.PackUnsignedSaturate(shiftLo2.AsInt16(), shiftHi2.AsInt16()).AsUInt32();
            var packed3 = Sse2.PackUnsignedSaturate(shiftLo3.AsInt16(), shiftHi3.AsInt16()).AsUInt32();
            var mask = Vector128.Create(0x00FFFFFFU);
            packed0 = Sse2.And(packed0, mask);
            packed1 = Sse2.And(packed1, mask);
            packed2 = Sse2.And(packed2, mask);
            packed3 = Sse2.And(packed3, mask);
            packed0 = Sse2.Or(packed0, alpha0);
            packed1 = Sse2.Or(packed1, alpha1);
            packed2 = Sse2.Or(packed2, alpha2);
            packed3 = Sse2.Or(packed3, alpha3);

            Sse2.Store(dataPtr + offset + 0x0, packed0);
            Sse2.Store(dataPtr + offset + 0x4, packed1);
            Sse2.Store(dataPtr + offset + 0x8, packed2);
            Sse2.Store(dataPtr + offset + 0xC, packed3);
        }
    }
    // This is unlikely to happen, but handle when there are still elements left (the texture size isn't aligned to 4)
    if (offset < data.Length)
    {
        ProcessTextureScalar(data.SliceUnsafe(offset));
    }
}
/// <summary>
/// Rotates each 32-bit lane of <paramref name="value"/> left by 16 bits.
/// A 16-bit rotate of a uint lane is exactly a byte-granular permutation, so on
/// SSSE3 hardware this is a single shuffle; otherwise it falls back to the
/// generic shift-based rotation helper.
/// </summary>
public static Vector128<uint> RotateLeftUInt32_16(this Vector128<uint> value)
{
    if (Ssse3.IsSupported)
    {
        return Ssse3.Shuffle(value.AsByte(), Rot16_128).AsUInt32();
    }

    return value.RotateLeftUInt32(16);
}
// Returns the index of the first char in 'value' that must be escaped for JavaScript,
// or -1 when every character can be copied through verbatim. A supplied 'encoder' is
// authoritative; otherwise the built-in check (NeedsEscaping(char)) is consulted,
// with an SSE2 fast path scanning 8 chars at a time on inbox builds.
public static unsafe int NeedsEscaping(ReadOnlySpan <char> value, JavaScriptEncoder encoder)
{
    fixed(char *ptr = value)
    {
        int idx = 0;

        // Some implementations of JavascriptEncoder.FindFirstCharacterToEncode may not accept
        // null pointers and guard against that. Hence, check up-front and fall down to return -1.
        if (encoder != null && !value.IsEmpty)
        {
            idx = encoder.FindFirstCharacterToEncode(ptr, value.Length);
            goto Return;
        }

#if BUILDING_INBOX_LIBRARY
        if (Sse2.IsSupported)
        {
            short *startingAddress = (short *)ptr;
            while (value.Length - 8 >= idx)
            {
                Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + value.Length - 8));

                // Load the next 8 characters.
                Vector128 <short> sourceValue = Sse2.LoadVector128(startingAddress);

                // Check if any of the 8 characters need to be escaped.
                Vector128 <short> mask = CreateEscapingMask(sourceValue);

                int index = Sse2.MoveMask(mask.AsByte());
                // If index == 0, that means none of the 8 characters needed to be escaped.
                // TrailingZeroCount is relatively expensive, avoid it if possible.
                if (index != 0)
                {
                    // Found at least one character that needs to be escaped, figure out the index of
                    // the first one found that needed to be escaped within the 8 characters.
                    Debug.Assert(index > 0 && index <= 65_535);
                    int tzc = BitOperations.TrailingZeroCount(index);
                    // MoveMask is byte-granular, so each char contributes two mask bits; halve to
                    // convert a bit index back to a char index.
                    Debug.Assert(tzc % 2 == 0 && tzc >= 0 && tzc <= 16);
                    idx += tzc >> 1;
                    goto Return;
                }
                idx += 8;
                startingAddress += 8;
            }

            // Process the remaining characters.
            Debug.Assert(value.Length - idx < 8);
        }
#endif

        // Scalar tail (and the full scan on non-inbox builds).
        for (; idx < value.Length; idx++)
        {
            Debug.Assert((ptr + idx) <= (ptr + value.Length));
            if (NeedsEscaping(*(ptr + idx)))
            {
                goto Return;
            }
        }

        idx = -1; // All characters are allowed.

Return:
        return(idx);
    }
}
/// <summary>
/// Reverses the byte order of the whole 128-bit vector via an SSSE3 byte shuffle
/// driven by the <c>Reverse128</c> control vector.
/// NOTE(review): there is no <c>Ssse3.IsSupported</c> guard here — presumably the
/// callers check support before invoking this; confirm at call sites.
/// </summary>
public static Vector128 <T> ReverseEndianness128 <T>(this Vector128 <T> a) where T : struct
{
    Vector128<byte> shuffled = Ssse3.Shuffle(a.AsByte(), Reverse128);
    return shuffled.As <byte, T>();
}
// Scans UTF-8 text and returns the index of the first byte at which escaping must
// begin (either a disallowed scalar or ill-formed UTF-8), or -1 when the whole
// input can be copied through unchanged. Uses a 16-byte SIMD fast path (SSE2 or
// AdvSimd) for runs of allowed bytes, dropping to a scalar decode loop whenever a
// chunk contains non-ASCII bytes.
public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan <byte> utf8Text)
{
    fixed(byte *ptr = utf8Text)
    {
        int idx = 0;

#if NETCOREAPP
        if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported)
        {
            sbyte *startingAddress = (sbyte *)ptr;
            while (utf8Text.Length - 16 >= idx)
            {
                Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + utf8Text.Length - 16));

                // Load the next 16 bytes.
                Vector128 <sbyte> sourceValue;
                bool containsNonAsciiBytes;

                // Check for ASCII text. Any byte that's not in the ASCII range will already be negative when
                // casted to signed byte.
                if (Sse2.IsSupported)
                {
                    sourceValue = Sse2.LoadVector128(startingAddress);
                    containsNonAsciiBytes = Sse2Helper.ContainsNonAsciiByte(sourceValue);
                }
                else
                {
                    sourceValue = AdvSimd.LoadVector128(startingAddress);
                    containsNonAsciiBytes = AdvSimdHelper.ContainsNonAsciiByte(sourceValue);
                }

                if (containsNonAsciiBytes)
                {
                    // At least one of the following 16 bytes is non-ASCII.
                    int processNextSixteen = idx + 16;
                    Debug.Assert(processNextSixteen <= utf8Text.Length);
                    while (idx < processNextSixteen)
                    {
                        Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length));
                        if (UnicodeUtility.IsAsciiCodePoint(ptr[idx]))
                        {
                            if (!_allowedCharacters.IsUnicodeScalarAllowed(ptr[idx]))
                            {
                                goto Return;
                            }
                            idx++;
                        }
                        else
                        {
                            OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar);
                            Debug.Assert(nextScalarValue <= int.MaxValue);
                            if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue))
                            {
                                goto Return;
                            }
                            Debug.Assert(opStatus == OperationStatus.Done);
                            idx += utf8BytesConsumedForScalar;
                        }
                    }
                    // Re-align the vector cursor with wherever the scalar loop stopped.
                    startingAddress = (sbyte *)ptr + idx;
                }
                else
                {
                    // Check if any of the 16 bytes need to be escaped.
                    int index;
                    if (Sse2.IsSupported)
                    {
                        Vector128 <sbyte> mask = Sse2Helper.CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(sourceValue);
                        index = Sse2Helper.GetIndexOfFirstNonAsciiByte(mask.AsByte());
                    }
                    else
                    {
                        Vector128 <sbyte> mask = AdvSimdHelper.CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(sourceValue);
                        index = AdvSimdHelper.GetIndexOfFirstNonAsciiByte(mask.AsByte());
                    }

                    // If index >= 16, that means none of the 16 bytes needed to be escaped.
                    if (index < 16)
                    {
                        // Found at least one byte that needs to be escaped, figure out the index of
                        // the first one found that needed to be escaped within the 16 bytes.
                        idx += index;
                        goto Return;
                    }
                    idx += 16;
                    startingAddress += 16;
                }
            }

            // Process the remaining bytes.
            Debug.Assert(utf8Text.Length - idx < 16);
        }
#endif

        // Scalar loop: handles the tail (and everything on platforms without SIMD support).
        while (idx < utf8Text.Length)
        {
            Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length));
            if (UnicodeUtility.IsAsciiCodePoint(ptr[idx]))
            {
                if (!_allowedCharacters.IsUnicodeScalarAllowed(ptr[idx]))
                {
                    goto Return;
                }
                idx++;
            }
            else
            {
                OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar);
                Debug.Assert(nextScalarValue <= int.MaxValue);
                if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue))
                {
                    goto Return;
                }
                Debug.Assert(opStatus == OperationStatus.Done);
                idx += utf8BytesConsumedForScalar;
            }
        }

        Debug.Assert(idx == utf8Text.Length);
        idx = -1; // All bytes are allowed.

Return:
        return(idx);
    }
}
// ChaCha20 stream cipher core (SIMD): XORs 'bytes' bytes of message 'm' with the
// keystream derived from the 16-word state 'x' and writes the result to 'c',
// advancing the 64-bit block counter held in x[12] (low) / x[13] (high).
// Tiered fast paths: AVX2 processes 8 blocks (512 bytes) per iteration, SSE2
// processes 4 blocks (256 bytes), then single 64-byte blocks, then a trailing
// partial block through a zeroed stack buffer.
public static unsafe void ChaCha20(uint *x, byte *m, byte *c, ulong bytes)
{
    // ---- AVX2 path: 8 blocks per iteration, one block per 32-bit lane. ----
    if (Avx2.IsSupported && bytes >= 512)
    {
        // Broadcast every state word across all 8 lanes.
        Vector256 <uint> x_0 = Vector256.Create(x[0]);
        Vector256 <uint> x_1 = Vector256.Create(x[1]);
        Vector256 <uint> x_2 = Vector256.Create(x[2]);
        Vector256 <uint> x_3 = Vector256.Create(x[3]);
        Vector256 <uint> x_4 = Vector256.Create(x[4]);
        Vector256 <uint> x_5 = Vector256.Create(x[5]);
        Vector256 <uint> x_6 = Vector256.Create(x[6]);
        Vector256 <uint> x_7 = Vector256.Create(x[7]);
        Vector256 <uint> x_8 = Vector256.Create(x[8]);
        Vector256 <uint> x_9 = Vector256.Create(x[9]);
        Vector256 <uint> x_10 = Vector256.Create(x[10]);
        Vector256 <uint> x_11 = Vector256.Create(x[11]);
        Vector256 <uint> x_12; // per-lane counter low words, computed each iteration
        Vector256 <uint> x_13; // per-lane counter high words
        Vector256 <uint> x_14 = Vector256.Create(x[14]);
        Vector256 <uint> x_15 = Vector256.Create(x[15]);
        Vector256 <uint> orig0 = x_0;
        Vector256 <uint> orig1 = x_1;
        Vector256 <uint> orig2 = x_2;
        Vector256 <uint> orig3 = x_3;
        Vector256 <uint> orig4 = x_4;
        Vector256 <uint> orig5 = x_5;
        Vector256 <uint> orig6 = x_6;
        Vector256 <uint> orig7 = x_7;
        Vector256 <uint> orig8 = x_8;
        Vector256 <uint> orig9 = x_9;
        Vector256 <uint> orig10 = x_10;
        Vector256 <uint> orig11 = x_11;
        Vector256 <uint> orig12;
        Vector256 <uint> orig13;
        Vector256 <uint> orig14 = x_14;
        Vector256 <uint> orig15 = x_15;
        while (bytes >= 512)
        {
            // Per-lane counter offsets for the 8 consecutive blocks of this batch.
            Vector256 <uint> addv12 = Vector256.Create(0, 1, 2, 3).AsUInt32();
            Vector256 <uint> addv13 = Vector256.Create(4, 5, 6, 7).AsUInt32();
            Vector256 <uint> permute = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7).AsUInt32();
            Vector256 <uint> t12, t13;
            // Reset working state from the saved originals.
            x_0 = orig0;
            x_1 = orig1;
            x_2 = orig2;
            x_3 = orig3;
            x_4 = orig4;
            x_5 = orig5;
            x_6 = orig6;
            x_7 = orig7;
            x_8 = orig8;
            x_9 = orig9;
            x_10 = orig10;
            x_11 = orig11;
            x_14 = orig14;
            x_15 = orig15;
            // Build the 8 per-lane 64-bit counters (base .. base+7), split back into
            // low/high 32-bit halves via unpack + cross-lane permute.
            uint in12 = x[12];
            uint in13 = x[13];
            ulong in1213 = in12 | ((ulong)in13 << 32);
            x_12 = x_13 = Avx2.BroadcastScalarToVector256(Sse2.X64.ConvertScalarToVector128UInt64(in1213)).AsUInt32();
            t12 = Avx2.Add(addv12.AsUInt64(), x_12.AsUInt64()).AsUInt32();
            t13 = Avx2.Add(addv13.AsUInt64(), x_13.AsUInt64()).AsUInt32();
            x_12 = Avx2.UnpackLow(t12, t13);
            x_13 = Avx2.UnpackHigh(t12, t13);
            t12 = Avx2.UnpackLow(x_12, x_13);
            t13 = Avx2.UnpackHigh(x_12, x_13);
            x_12 = Avx2.PermuteVar8x32(t12, permute);
            x_13 = Avx2.PermuteVar8x32(t13, permute);
            orig12 = x_12;
            orig13 = x_13;
            // Advance the stored counter by the 8 blocks consumed this iteration.
            in1213 += 8;
            x[12] = (uint)(in1213 & 0xFFFFFFFF);
            x[13] = (uint)((in1213 >> 32) & 0xFFFFFFFF);
            // 20 rounds = 10 double-rounds (column round, then diagonal round).
            for (int i = 0; i < 20; i += 2)
            {
                Vec256Round(ref x_0, ref x_4, ref x_8, ref x_12, ref x_1, ref x_5, ref x_9, ref x_13, ref x_2, ref x_6, ref x_10, ref x_14, ref x_3, ref x_7, ref x_11, ref x_15);
                Vec256Round(ref x_0, ref x_5, ref x_10, ref x_15, ref x_1, ref x_6, ref x_11, ref x_12, ref x_2, ref x_7, ref x_8, ref x_13, ref x_3, ref x_4, ref x_9, ref x_14);
            }
            Vector256 <uint> t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, t_13, t_14, t_15;
            t_0 = t_1 = t_2 = t_3 = t_4 = t_5 = t_6 = t_7 = t_8 = t_9 = t_10 = t_11 = t_12 = t_13 = t_14 = t_15 = Vector256.Create((uint)0);
            // Add originals, transpose lanes back to block order, XOR with the
            // message and store. The two octos interleave 32-byte halves, hence
            // the +/- 32 pointer adjustments between them.
            // ONEOCTO enter
            OneQuadUnpack(ref x_0, ref x_1, ref x_2, ref x_3, ref t_0, ref t_1, ref t_2, ref t_3, ref orig0, ref orig1, ref orig2, ref orig3);
            OneQuadUnpack(ref x_4, ref x_5, ref x_6, ref x_7, ref t_4, ref t_5, ref t_6, ref t_7, ref orig4, ref orig5, ref orig6, ref orig7);
            t_0 = Avx2.Permute2x128(x_0, x_4, 0x20);
            t_4 = Avx2.Permute2x128(x_0, x_4, 0x31);
            t_1 = Avx2.Permute2x128(x_1, x_5, 0x20);
            t_5 = Avx2.Permute2x128(x_1, x_5, 0x31);
            t_2 = Avx2.Permute2x128(x_2, x_6, 0x20);
            t_6 = Avx2.Permute2x128(x_2, x_6, 0x31);
            t_3 = Avx2.Permute2x128(x_3, x_7, 0x20);
            t_7 = Avx2.Permute2x128(x_3, x_7, 0x31);
            t_0 = Avx2.Xor(t_0, Avx.LoadVector256(m).AsUInt32());
            t_1 = Avx2.Xor(t_1, Avx.LoadVector256(m + 64).AsUInt32());
            t_2 = Avx2.Xor(t_2, Avx.LoadVector256(m + 128).AsUInt32());
            t_3 = Avx2.Xor(t_3, Avx.LoadVector256(m + 192).AsUInt32());
            t_4 = Avx2.Xor(t_4, Avx.LoadVector256(m + 256).AsUInt32());
            t_5 = Avx2.Xor(t_5, Avx.LoadVector256(m + 320).AsUInt32());
            t_6 = Avx2.Xor(t_6, Avx.LoadVector256(m + 384).AsUInt32());
            t_7 = Avx2.Xor(t_7, Avx.LoadVector256(m + 448).AsUInt32());
            Avx.Store(c, t_0.AsByte());
            Avx.Store(c + 64, t_1.AsByte());
            Avx.Store(c + 128, t_2.AsByte());
            Avx.Store(c + 192, t_3.AsByte());
            Avx.Store(c + 256, t_4.AsByte());
            Avx.Store(c + 320, t_5.AsByte());
            Avx.Store(c + 384, t_6.AsByte());
            Avx.Store(c + 448, t_7.AsByte());
            // ONEOCTO exit
            m += 32;
            c += 32;
            // ONEOCTO enter
            OneQuadUnpack(ref x_8, ref x_9, ref x_10, ref x_11, ref t_8, ref t_9, ref t_10, ref t_11, ref orig8, ref orig9, ref orig10, ref orig11);
            OneQuadUnpack(ref x_12, ref x_13, ref x_14, ref x_15, ref t_12, ref t_13, ref t_14, ref t_15, ref orig12, ref orig13, ref orig14, ref orig15);
            t_8 = Avx2.Permute2x128(x_8, x_12, 0x20);
            t_12 = Avx2.Permute2x128(x_8, x_12, 0x31);
            t_9 = Avx2.Permute2x128(x_9, x_13, 0x20);
            t_13 = Avx2.Permute2x128(x_9, x_13, 0x31);
            t_10 = Avx2.Permute2x128(x_10, x_14, 0x20);
            t_14 = Avx2.Permute2x128(x_10, x_14, 0x31);
            t_11 = Avx2.Permute2x128(x_11, x_15, 0x20);
            t_15 = Avx2.Permute2x128(x_11, x_15, 0x31);
            t_8 = Avx2.Xor(t_8, Avx.LoadVector256(m).AsUInt32());
            t_9 = Avx2.Xor(t_9, Avx.LoadVector256(m + 64).AsUInt32());
            t_10 = Avx2.Xor(t_10, Avx.LoadVector256(m + 128).AsUInt32());
            t_11 = Avx2.Xor(t_11, Avx.LoadVector256(m + 192).AsUInt32());
            t_12 = Avx2.Xor(t_12, Avx.LoadVector256(m + 256).AsUInt32());
            t_13 = Avx2.Xor(t_13, Avx.LoadVector256(m + 320).AsUInt32());
            t_14 = Avx2.Xor(t_14, Avx.LoadVector256(m + 384).AsUInt32());
            t_15 = Avx2.Xor(t_15, Avx.LoadVector256(m + 448).AsUInt32());
            Avx.Store(c, t_8.AsByte());
            Avx.Store(c + 64, t_9.AsByte());
            Avx.Store(c + 128, t_10.AsByte());
            Avx.Store(c + 192, t_11.AsByte());
            Avx.Store(c + 256, t_12.AsByte());
            Avx.Store(c + 320, t_13.AsByte());
            Avx.Store(c + 384, t_14.AsByte());
            Avx.Store(c + 448, t_15.AsByte());
            // ONEOCTO exit
            m -= 32;
            c -= 32;
            bytes -= 512;
            c += 512;
            m += 512;
        }
    }
    // ---- SSE2 path: 4 blocks (256 bytes) per iteration, one block per lane. ----
    if (bytes >= 256)
    {
        Vector128 <uint> x_0 = Vector128.Create(x[0]);
        Vector128 <uint> x_1 = Vector128.Create(x[1]);
        Vector128 <uint> x_2 = Vector128.Create(x[2]);
        Vector128 <uint> x_3 = Vector128.Create(x[3]);
        Vector128 <uint> x_4 = Vector128.Create(x[4]);
        Vector128 <uint> x_5 = Vector128.Create(x[5]);
        Vector128 <uint> x_6 = Vector128.Create(x[6]);
        Vector128 <uint> x_7 = Vector128.Create(x[7]);
        Vector128 <uint> x_8 = Vector128.Create(x[8]);
        Vector128 <uint> x_9 = Vector128.Create(x[9]);
        Vector128 <uint> x_10 = Vector128.Create(x[10]);
        Vector128 <uint> x_11 = Vector128.Create(x[11]);
        Vector128 <uint> x_12; // per-lane counter low words
        Vector128 <uint> x_13; // per-lane counter high words
        Vector128 <uint> x_14 = Vector128.Create(x[14]);
        Vector128 <uint> x_15 = Vector128.Create(x[15]);
        Vector128 <uint> orig0 = x_0;
        Vector128 <uint> orig1 = x_1;
        Vector128 <uint> orig2 = x_2;
        Vector128 <uint> orig3 = x_3;
        Vector128 <uint> orig4 = x_4;
        Vector128 <uint> orig5 = x_5;
        Vector128 <uint> orig6 = x_6;
        Vector128 <uint> orig7 = x_7;
        Vector128 <uint> orig8 = x_8;
        Vector128 <uint> orig9 = x_9;
        Vector128 <uint> orig10 = x_10;
        Vector128 <uint> orig11 = x_11;
        Vector128 <uint> orig12;
        Vector128 <uint> orig13;
        Vector128 <uint> orig14 = x_14;
        Vector128 <uint> orig15 = x_15;
        Vector128 <uint> t12, t13;
        while (bytes >= 256)
        {
            // Per-lane counter offsets for the 4 consecutive blocks of this batch.
            Vector128 <uint> addv12 = Vector128.Create(0, 1).AsUInt32();
            Vector128 <uint> addv13 = Vector128.Create(2, 3).AsUInt32();
            // Reset working state from the saved originals.
            x_0 = orig0;
            x_1 = orig1;
            x_2 = orig2;
            x_3 = orig3;
            x_4 = orig4;
            x_5 = orig5;
            x_6 = orig6;
            x_7 = orig7;
            x_8 = orig8;
            x_9 = orig9;
            x_10 = orig10;
            x_11 = orig11;
            x_14 = orig14;
            x_15 = orig15;
            // Build the 4 per-lane 64-bit counters (base .. base+3) and deinterleave
            // the low/high 32-bit halves into x_12/x_13 via unpacks.
            uint in12 = x[12];
            uint in13 = x[13];
            ulong in1213 = in12 | ((ulong)in13) << 32;
            t12 = Vector128.Create(in1213).AsUInt32();
            t13 = Vector128.Create(in1213).AsUInt32();
            x_12 = Sse2.Add(Vector128.AsUInt64 <uint>(addv12), Vector128.AsUInt64 <uint>(t12)).AsUInt32();
            x_13 = Sse2.Add(Vector128.AsUInt64 <uint>(addv13), Vector128.AsUInt64 <uint>(t13)).AsUInt32();
            t12 = Sse2.UnpackLow(x_12, x_13);
            t13 = Sse2.UnpackHigh(x_12, x_13);
            x_12 = Sse2.UnpackLow(t12, t13);
            x_13 = Sse2.UnpackHigh(t12, t13);
            orig12 = x_12;
            orig13 = x_13;
            // Advance the stored counter by the 4 blocks consumed this iteration.
            in1213 += 4;
            x[12] = (uint)(in1213 & 0xFFFFFFFF);
            x[13] = (uint)(in1213 >> 32 & 0xFFFFFFFF);
            // 20 rounds = 10 double-rounds: four column quarter-rounds, then four
            // diagonal quarter-rounds.
            for (int i = 0; i < 20; i += 2)
            {
                Vec128QuarterRound(ref x_0, ref x_4, ref x_8, ref x_12);
                Vec128QuarterRound(ref x_1, ref x_5, ref x_9, ref x_13);
                Vec128QuarterRound(ref x_2, ref x_6, ref x_10, ref x_14);
                Vec128QuarterRound(ref x_3, ref x_7, ref x_11, ref x_15);
                Vec128QuarterRound(ref x_0, ref x_5, ref x_10, ref x_15);
                Vec128QuarterRound(ref x_1, ref x_6, ref x_11, ref x_12);
                Vec128QuarterRound(ref x_2, ref x_7, ref x_8, ref x_13);
                Vec128QuarterRound(ref x_3, ref x_4, ref x_9, ref x_14);
            }
            // Each OneQuad adds the originals, transposes back to block order and
            // XOR-stores four interleaved 16-byte slices; step the pointers between
            // quads, then rewind and jump a full 256 bytes forward.
            OneQuad(ref x_0, ref x_1, ref x_2, ref x_3, ref orig0, ref orig1, ref orig2, ref orig3, m, c);
            m += 16;
            c += 16;
            OneQuad(ref x_4, ref x_5, ref x_6, ref x_7, ref orig4, ref orig5, ref orig6, ref orig7, m, c);
            m += 16;
            c += 16;
            OneQuad(ref x_8, ref x_9, ref x_10, ref x_11, ref orig8, ref orig9, ref orig10, ref orig11, m, c);
            m += 16;
            c += 16;
            OneQuad(ref x_12, ref x_13, ref x_14, ref x_15, ref orig12, ref orig13, ref orig14, ref orig15, m, c);
            m -= 48;
            c -= 48;
            bytes -= 256;
            c += 256;
            m += 256;
        }
    }
    // ---- Single-block path: 64 bytes per iteration, state held as 4 row vectors. ----
    while (bytes >= 64)
    {
        Vector128 <uint> x_0 = Sse2.LoadVector128(x);
        Vector128 <uint> x_1 = Sse2.LoadVector128(x + 4);
        Vector128 <uint> x_2 = Sse2.LoadVector128(x + 8);
        Vector128 <uint> x_3 = Sse2.LoadVector128(x + 12);
        Vector128 <uint> t_1;
        for (int i = 0; i < 20; i += 2)
        {
            // Column round: a+=b; d^=a; d<<<16; c+=d; b^=c; b<<<12; a+=b; d^=a; d<<<8;
            // then row rotations (Shuffle 147/78/57) move the state into diagonal form.
            x_0 = Sse2.Add(x_0, x_1);
            x_3 = Sse2.Xor(x_3, x_0);
            x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32();
            x_2 = Sse2.Add(x_2, x_3);
            x_1 = Sse2.Xor(x_1, x_2);
            t_1 = x_1;
            x_1 = Sse2.ShiftLeftLogical(x_1, 12);
            t_1 = Sse2.ShiftRightLogical(t_1, 20);
            x_1 = Sse2.Xor(x_1, t_1); // rotate-left-12 composed from two shifts
            x_0 = Sse2.Add(x_0, x_1);
            x_3 = Sse2.Xor(x_3, x_0);
            x_0 = Sse2.Shuffle(x_0, 147);
            x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32();
            x_2 = Sse2.Add(x_2, x_3);
            x_3 = Sse2.Shuffle(x_3, 78);
            x_1 = Sse2.Xor(x_1, x_2);
            x_2 = Sse2.Shuffle(x_2, 57);
            t_1 = x_1;
            x_1 = Sse2.ShiftLeftLogical(x_1, 7);
            t_1 = Sse2.ShiftRightLogical(t_1, 25);
            x_1 = Sse2.Xor(x_1, t_1); // rotate-left-7
            // Diagonal round, then the inverse row rotations restore column form.
            x_0 = Sse2.Add(x_0, x_1);
            x_3 = Sse2.Xor(x_3, x_0);
            x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32();
            x_2 = Sse2.Add(x_2, x_3);
            x_1 = Sse2.Xor(x_1, x_2);
            t_1 = x_1;
            x_1 = Sse2.ShiftLeftLogical(x_1, 12);
            t_1 = Sse2.ShiftRightLogical(t_1, 20);
            x_1 = Sse2.Xor(x_1, t_1);
            x_0 = Sse2.Add(x_0, x_1);
            x_3 = Sse2.Xor(x_3, x_0);
            x_0 = Sse2.Shuffle(x_0, 57);
            x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32();
            x_2 = Sse2.Add(x_2, x_3);
            x_3 = Sse2.Shuffle(x_3, 78);
            x_1 = Sse2.Xor(x_1, x_2);
            x_2 = Sse2.Shuffle(x_2, 147);
            t_1 = x_1;
            x_1 = Sse2.ShiftLeftLogical(x_1, 7);
            t_1 = Sse2.ShiftRightLogical(t_1, 25);
            x_1 = Sse2.Xor(x_1, t_1);
        }
        // Feed-forward: add the original state, XOR the keystream into the message.
        x_0 = Sse2.Add(x_0, Sse2.LoadVector128(x));
        x_1 = Sse2.Add(x_1, Sse2.LoadVector128(x + 4));
        x_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8));
        x_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12));
        x_0 = Sse2.Xor(x_0.AsByte(), Sse2.LoadVector128(m)).AsUInt32();
        x_1 = Sse2.Xor(x_1.AsByte(), Sse2.LoadVector128(m + 16)).AsUInt32();
        x_2 = Sse2.Xor(x_2.AsByte(), Sse2.LoadVector128(m + 32)).AsUInt32();
        x_3 = Sse2.Xor(x_3.AsByte(), Sse2.LoadVector128(m + 48)).AsUInt32();
        Sse2.Store(c, x_0.AsByte());
        Sse2.Store(c + 16, x_1.AsByte());
        Sse2.Store(c + 32, x_2.AsByte());
        Sse2.Store(c + 48, x_3.AsByte());
        // Bump the 64-bit block counter with a manual carry into the high word.
        uint in12 = x[12];
        uint in13 = x[13];
        in12++;
        if (in12 == 0)
        {
            in13++;
        }
        x[12] = in12;
        x[13] = in13;
        bytes -= 64;
        c += 64;
        m += 64;
    }
    // ---- Partial final block (< 64 bytes): generate a full keystream block on the
    // stack and XOR only the bytes needed. The counter is not advanced here.
    if (bytes > 0)
    {
        Vector128 <uint> x_0 = Sse2.LoadVector128(x);
        Vector128 <uint> x_1 = Sse2.LoadVector128(x + 4);
        Vector128 <uint> x_2 = Sse2.LoadVector128(x + 8);
        Vector128 <uint> x_3 = Sse2.LoadVector128(x + 12);
        Vector128 <uint> t_1;
        // Same double-round body as the 64-byte loop above.
        for (int i = 0; i < 20; i += 2)
        {
            x_0 = Sse2.Add(x_0, x_1);
            x_3 = Sse2.Xor(x_3, x_0);
            x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32();
            x_2 = Sse2.Add(x_2, x_3);
            x_1 = Sse2.Xor(x_1, x_2);
            t_1 = x_1;
            x_1 = Sse2.ShiftLeftLogical(x_1, 12);
            t_1 = Sse2.ShiftRightLogical(t_1, 20);
            x_1 = Sse2.Xor(x_1, t_1);
            x_0 = Sse2.Add(x_0, x_1);
            x_3 = Sse2.Xor(x_3, x_0);
            x_0 = Sse2.Shuffle(x_0, 0x93);
            x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32();
            x_2 = Sse2.Add(x_2, x_3);
            x_3 = Sse2.Shuffle(x_3, 0x4e);
            x_1 = Sse2.Xor(x_1, x_2);
            x_2 = Sse2.Shuffle(x_2, 0x39);
            t_1 = x_1;
            x_1 = Sse2.ShiftLeftLogical(x_1, 7);
            t_1 = Sse2.ShiftRightLogical(t_1, 25);
            x_1 = Sse2.Xor(x_1, t_1);
            x_0 = Sse2.Add(x_0, x_1);
            x_3 = Sse2.Xor(x_3, x_0);
            x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32();
            x_2 = Sse2.Add(x_2, x_3);
            x_1 = Sse2.Xor(x_1, x_2);
            t_1 = x_1;
            x_1 = Sse2.ShiftLeftLogical(x_1, 12);
            t_1 = Sse2.ShiftRightLogical(t_1, 20);
            x_1 = Sse2.Xor(x_1, t_1);
            x_0 = Sse2.Add(x_0, x_1);
            x_3 = Sse2.Xor(x_3, x_0);
            x_0 = Sse2.Shuffle(x_0, 0x39);
            x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32();
            x_2 = Sse2.Add(x_2, x_3);
            x_3 = Sse2.Shuffle(x_3, 0x4e);
            x_1 = Sse2.Xor(x_1, x_2);
            x_2 = Sse2.Shuffle(x_2, 0x93);
            t_1 = x_1;
            x_1 = Sse2.ShiftLeftLogical(x_1, 7);
            t_1 = Sse2.ShiftRightLogical(t_1, 25);
            x_1 = Sse2.Xor(x_1, t_1);
        }
        x_0 = Sse2.Add(x_0, Sse2.LoadVector128(x));
        x_1 = Sse2.Add(x_1, Sse2.LoadVector128(x + 4));
        x_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8));
        x_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12));
        // Spill the keystream block to the stack so we can XOR byte-by-byte.
        byte *partialblock = stackalloc byte[64];
        Sse2.Store(partialblock, Vector128.AsByte(x_0));
        Sse2.Store(partialblock + 16, Vector128.AsByte(x_1));
        Sse2.Store(partialblock + 32, Vector128.AsByte(x_2));
        Sse2.Store(partialblock + 48, Vector128.AsByte(x_3));
        for (ulong i = 0; i < bytes; i++)
        {
            c[i] = (byte)(m[i] ^ partialblock[i]);
        }
        // Scrub the keystream remnants from the stack buffer.
        for (int n = 0; n < 64 / sizeof(int); n++)
        {
            ((int *)partialblock)[n] = 0;
        }
    }
}
// Scans UTF-8 text and returns the index of the first byte at which escaping must
// begin (a disallowed scalar or ill-formed UTF-8), or -1 when the whole input can
// be copied through unchanged. Uses a 16-byte SIMD fast path where available,
// with a fully unrolled scalar fallback (labels Return1..Return15) when SSSE3 is not.
public virtual unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan <byte> utf8Text)
{
    // Lazily build the ASCII escaping cache on first use.
    if (!_isAsciiCacheInitialized)
    {
        InitializeAsciiCache();
    }

    // Loop through the input text, terminating when we see ill-formed UTF-8 or when we decode a scalar value
    // that must be encoded. If we see either of these things then we'll return its index in the original
    // input sequence. If we consume the entire text without seeing either of these, return -1 to indicate
    // that the text can be copied as-is without escaping.
    fixed(byte *ptr = utf8Text)
    {
        int idx = 0;

#if NETCOREAPP
        if ((Sse2.IsSupported || AdvSimd.Arm64.IsSupported) && utf8Text.Length - 16 >= idx)
        {
            // Hoist these outside the loop, as the JIT won't do it.
            Vector128 <sbyte> bitMaskLookupAsciiNeedsEscaping = _bitMaskLookupAsciiNeedsEscaping;
            Vector128 <sbyte> bitPosLookup = Ssse3Helper.s_bitPosLookup;
            Vector128 <sbyte> nibbleMaskSByte = Ssse3Helper.s_nibbleMaskSByte;
            Vector128 <sbyte> nullMaskSByte = Ssse3Helper.s_nullMaskSByte;

            sbyte *startingAddress = (sbyte *)ptr;
            do
            {
                Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + utf8Text.Length - 16));

                // Load the next 16 bytes.
                Vector128 <sbyte> sourceValue;
                bool containsNonAsciiBytes;

                // Check for ASCII text. Any byte that's not in the ASCII range will already be negative when
                // casted to signed byte.
                if (Sse2.IsSupported)
                {
                    sourceValue = Sse2.LoadVector128(startingAddress);
                    containsNonAsciiBytes = Sse2Helper.ContainsNonAsciiByte(sourceValue);
                }
                else if (AdvSimd.Arm64.IsSupported)
                {
                    sourceValue = AdvSimd.LoadVector128(startingAddress);
                    containsNonAsciiBytes = AdvSimdHelper.ContainsNonAsciiByte(sourceValue);
                }
                else
                {
                    throw new PlatformNotSupportedException();
                }

                if (!containsNonAsciiBytes)
                {
                    // All of the following 16 bytes is ASCII.

                    // TODO AdvSimd: optimization maybe achievable using VectorTableLookup and/or VectorTableLookupExtension
                    if (Ssse3.IsSupported)
                    {
                        Vector128 <sbyte> mask = Ssse3Helper.CreateEscapingMask(sourceValue, bitMaskLookupAsciiNeedsEscaping, bitPosLookup, nibbleMaskSByte, nullMaskSByte);
                        int index = Sse2Helper.GetIndexOfFirstNonAsciiByte(mask.AsByte());
                        if (index < 16)
                        {
                            idx += index;
                            goto Return;
                        }
                    }
                    else
                    {
                        // Fully unrolled scalar check; the per-offset labels below return
                        // idx plus the matching offset without extra arithmetic here.
                        byte *p = (byte *)startingAddress;
                        if (DoesAsciiNeedEncoding(p[0])) { goto Return; }
                        if (DoesAsciiNeedEncoding(p[1])) { goto Return1; }
                        if (DoesAsciiNeedEncoding(p[2])) { goto Return2; }
                        if (DoesAsciiNeedEncoding(p[3])) { goto Return3; }
                        if (DoesAsciiNeedEncoding(p[4])) { goto Return4; }
                        if (DoesAsciiNeedEncoding(p[5])) { goto Return5; }
                        if (DoesAsciiNeedEncoding(p[6])) { goto Return6; }
                        if (DoesAsciiNeedEncoding(p[7])) { goto Return7; }
                        if (DoesAsciiNeedEncoding(p[8])) { goto Return8; }
                        if (DoesAsciiNeedEncoding(p[9])) { goto Return9; }
                        if (DoesAsciiNeedEncoding(p[10])) { goto Return10; }
                        if (DoesAsciiNeedEncoding(p[11])) { goto Return11; }
                        if (DoesAsciiNeedEncoding(p[12])) { goto Return12; }
                        if (DoesAsciiNeedEncoding(p[13])) { goto Return13; }
                        if (DoesAsciiNeedEncoding(p[14])) { goto Return14; }
                        if (DoesAsciiNeedEncoding(p[15])) { goto Return15; }
                    }
                    idx += 16;
                }
                else
                {
                    // At least one of the following 16 bytes is non-ASCII.
                    int processNextSixteen = idx + 16;
                    Debug.Assert(processNextSixteen <= utf8Text.Length);
                    while (idx < processNextSixteen)
                    {
                        Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length));
                        if (UnicodeUtility.IsAsciiCodePoint(ptr[idx]))
                        {
                            if (DoesAsciiNeedEncoding(ptr[idx]))
                            {
                                goto Return;
                            }
                            idx++;
                        }
                        else
                        {
                            OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar);
                            Debug.Assert(nextScalarValue <= int.MaxValue);
                            if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue))
                            {
                                goto Return;
                            }
                            Debug.Assert(opStatus == OperationStatus.Done);
                            idx += utf8BytesConsumedForScalar;
                        }
                    }
                }
                // Re-align the vector cursor with wherever this chunk's processing stopped.
                startingAddress = (sbyte *)ptr + idx;
            }while (utf8Text.Length - 16 >= idx);

            // Process the remaining bytes.
            Debug.Assert(utf8Text.Length - idx < 16);
        }
#endif

        // Scalar loop: handles the tail (and everything on platforms without SIMD support).
        while (idx < utf8Text.Length)
        {
            Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length));
            if (UnicodeUtility.IsAsciiCodePoint(ptr[idx]))
            {
                if (DoesAsciiNeedEncoding(ptr[idx]))
                {
                    goto Return;
                }
                idx++;
            }
            else
            {
                OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar);
                Debug.Assert(nextScalarValue <= int.MaxValue);
                if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue))
                {
                    goto Return;
                }
                Debug.Assert(opStatus == OperationStatus.Done);
                idx += utf8BytesConsumedForScalar;
            }
        }

        Debug.Assert(idx == utf8Text.Length);
        idx = -1; // All bytes are allowed.
        goto Return;

#if NETCOREAPP
        // Targets for the unrolled scalar checks above: idx still points at the
        // start of the 16-byte chunk, so add the offset of the offending byte.
Return15:
        return(idx + 15);
Return14:
        return(idx + 14);
Return13:
        return(idx + 13);
Return12:
        return(idx + 12);
Return11:
        return(idx + 11);
Return10:
        return(idx + 10);
Return9:
        return(idx + 9);
Return8:
        return(idx + 8);
Return7:
        return(idx + 7);
Return6:
        return(idx + 6);
Return5:
        return(idx + 5);
Return4:
        return(idx + 4);
Return3:
        return(idx + 3);
Return2:
        return(idx + 2);
Return1:
        return(idx + 1);
#endif

Return:
        return(idx);
    }
}
// Returns the index of the first byte in 'value' that must be escaped, or -1 when
// every byte can be copied through verbatim. A supplied 'encoder' is authoritative;
// otherwise the built-in check (NeedsEscaping(byte)) is consulted, with an SSE2
// fast path scanning 16 bytes at a time on inbox builds.
public static unsafe int NeedsEscaping(ReadOnlySpan <byte> value, JavaScriptEncoder encoder)
{
    fixed(byte *ptr = value)
    {
        int idx = 0;

        if (encoder != null)
        {
            idx = encoder.FindFirstCharacterToEncodeUtf8(value);
            goto Return;
        }

#if BUILDING_INBOX_LIBRARY
        if (Sse2.IsSupported)
        {
            sbyte *startingAddress = (sbyte *)ptr;
            while (value.Length - 16 >= idx)
            {
                Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + value.Length - 16));

                // Load the next 16 bytes.
                Vector128 <sbyte> sourceValue = Sse2.LoadVector128(startingAddress);

                // Check if any of the 16 bytes need to be escaped.
                Vector128 <sbyte> mask = CreateEscapingMask(sourceValue);

                int index = Sse2.MoveMask(mask.AsByte());
                // If index == 0, that means none of the 16 bytes needed to be escaped.
                // TrailingZeroCount is relatively expensive, avoid it if possible.
                if (index != 0)
                {
                    // Found at least one byte that needs to be escaped, figure out the index of
                    // the first one found that needed to be escaped within the 16 bytes.
                    Debug.Assert(index > 0 && index <= 65_535);
                    int tzc = BitOperations.TrailingZeroCount(index);
                    Debug.Assert(tzc >= 0 && tzc <= 16);
                    idx += tzc;
                    goto Return;
                }
                idx += 16;
                startingAddress += 16;
            }

            // Process the remaining characters.
            Debug.Assert(value.Length - idx < 16);
        }
#endif

        // Scalar tail (and the full scan on non-inbox builds).
        for (; idx < value.Length; idx++)
        {
            Debug.Assert((ptr + idx) <= (ptr + value.Length));
            if (NeedsEscaping(*(ptr + idx)))
            {
                goto Return;
            }
        }

        idx = -1; // all characters allowed

Return:
        return(idx);
    }
}