public static void Orn_V(ArmEmitterContext context)
{
    if (Optimizations.UseSse2)
    {
        OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

        Operand n = GetVec(op.Rn);
        Operand m = GetVec(op.Rm);

        Operand mask = X86GetAllElements(context, -1L);

        Operand res = context.AddIntrinsic(Intrinsic.X86Pandn, m, mask);

        res = context.AddIntrinsic(Intrinsic.X86Por, res, n);

        if (op.RegisterSize == RegisterSize.Simd64)
        {
            res = context.VectorZeroUpper64(res);
        }

        context.Copy(GetVec(op.Rd), res);
    }
    else
    {
        EmitVectorBinaryOpZx(context, (op1, op2) =>
        {
            return context.BitwiseOr(op1, context.BitwiseNot(op2));
        });
    }
}
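// A scalar model of the ORN operation emitted above (an illustrative sketch;
// the helper name is ours, not part of the original emitter). X86 PANDN
// computes ~first & second, so pandn(m, allOnes) yields ~m, and the POR folds
// n back in, matching the fallback BitwiseOr(op1, BitwiseNot(op2)).
private static ulong OrnScalarModel(ulong n, ulong m)
{
    return n | ~m;
}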
public static Operand EmitScalarInsert(ArmEmitterContext context, Operand target, Operand value, int reg, bool doubleWidth)
{
    Debug.Assert(target.Type == OperandType.V128 && value.Type == OperandType.V128);

    // Insert from index 0 in value to index in target.
    int index = reg & (doubleWidth ? 1 : 3);

    if (doubleWidth)
    {
        if (index == 1)
        {
            return context.AddIntrinsic(Intrinsic.X86Movlhps, target, value); // Low to high.
        }
        else
        {
            return context.AddIntrinsic(Intrinsic.X86Shufpd, value, target, Const(2)); // Low to low, keep high from original.
        }
    }
    else
    {
        if (Optimizations.UseSse41)
        {
            return context.AddIntrinsic(Intrinsic.X86Insertps, target, value, Const(index << 4));
        }
        else
        {
            target = EmitSwapScalar(context, target, index, doubleWidth); // Swap value to replace into element 0.
            target = context.AddIntrinsic(Intrinsic.X86Movss, target, value); // Move the value into element 0 of the vector.

            return EmitSwapScalar(context, target, index, doubleWidth); // Swap new value back to the correct index.
        }
    }
}
public static void Ushll_V(ArmEmitterContext context)
{
    OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;

    int shift = GetImmShl(op);

    if (Optimizations.UseSse41)
    {
        Operand n = GetVec(op.Rn);

        if (op.RegisterSize == RegisterSize.Simd128)
        {
            n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8));
        }

        Intrinsic movzxInst = X86PmovzxInstruction[op.Size];

        Operand res = context.AddIntrinsic(movzxInst, n);

        if (shift != 0)
        {
            Intrinsic sllInst = X86PsllInstruction[op.Size + 1];

            res = context.AddIntrinsic(sllInst, res, Const(shift));
        }

        context.Copy(GetVec(op.Rd), res);
    }
    else
    {
        EmitVectorShImmWidenBinaryZx(context, (op1, op2) => context.ShiftLeft(op1, op2), shift);
    }
}
public static void Bsl_V(ArmEmitterContext context)
{
    if (Optimizations.UseSse2)
    {
        OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

        Operand d = GetVec(op.Rd);
        Operand n = GetVec(op.Rn);
        Operand m = GetVec(op.Rm);

        Operand res = context.AddIntrinsic(Intrinsic.X86Pxor, n, m);

        res = context.AddIntrinsic(Intrinsic.X86Pand, res, d);
        res = context.AddIntrinsic(Intrinsic.X86Pxor, res, m);

        if (op.RegisterSize == RegisterSize.Simd64)
        {
            res = context.VectorZeroUpper64(res);
        }

        context.Copy(GetVec(op.Rd), res);
    }
    else
    {
        EmitVectorTernaryOpZx(context, (op1, op2, op3) =>
        {
            return context.BitwiseExclusiveOr(
                context.BitwiseAnd(op1, context.BitwiseExclusiveOr(op2, op3)), op3);
        });
    }
}
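// A scalar model of the bit-select identity used above (illustrative sketch;
// hypothetical helper, not part of the original emitter). ((n ^ m) & d) ^ m
// equals (d & n) | (~d & m): where d has a 1 the xor/and pair recovers n,
// elsewhere m survives, so BSL needs no explicit NOT on x86.
private static ulong BslScalarModel(ulong d, ulong n, ulong m)
{
    return ((n ^ m) & d) ^ m;
}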
public static void Vrev(ArmEmitterContext context)
{
    OpCode32SimdRev op = (OpCode32SimdRev)context.CurrOp;

    if (Optimizations.UseSsse3)
    {
        EmitVectorUnaryOpSimd32(context, (op1) =>
        {
            Operand mask;

            switch (op.Size)
            {
                case 3:
                    // Rev64
                    switch (op.Opc)
                    {
                        case 0:
                            mask = X86GetElements(context, 0x08090a0b0c0d0e0fL, 0x0001020304050607L);
                            return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
                        case 1:
                            mask = X86GetElements(context, 0x09080b0a0d0c0f0eL, 0x0100030205040706L);
                            return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
                        case 2:
                            return context.AddIntrinsic(Intrinsic.X86Shufps, op1, op1, Const(1 | (0 << 2) | (3 << 4) | (2 << 6)));
                    }
                    break;
                case 2:
                    // Rev32
                    switch (op.Opc)
                    {
                        case 0:
                            mask = X86GetElements(context, 0x0c0d0e0f_08090a0bL, 0x04050607_00010203L);
                            return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
                        case 1:
                            // Halfwords swapped within each word; mask derived from the
                            // byte-index pattern of the cases above, as the original
                            // text breaks off mid-switch here.
                            mask = X86GetElements(context, 0x0d0c0f0e_09080b0aL, 0x05040706_01000302L);
                            return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
                    }
                    break;
                case 1:
                    // Rev16: bytes swapped within each halfword; mask derived from
                    // the same pattern.
                    mask = X86GetElements(context, 0x0e0f0c0d_0a0b0809L, 0x06070405_02030001L);
                    return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
            }

            throw new InvalidOperationException("Invalid VREV opcode + size combination.");
        });
    }
    // (The non-SSSE3 fallback path is truncated in the source and not reconstructed here.)
}
public static void Usra_V(ArmEmitterContext context)
{
    OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;

    if (Optimizations.UseSse2 && op.Size > 0)
    {
        int shift = GetImmShr(op);

        Operand d = GetVec(op.Rd);
        Operand n = GetVec(op.Rn);

        Intrinsic srlInst = X86PsrlInstruction[op.Size];

        Operand res = context.AddIntrinsic(srlInst, n, Const(shift));

        Intrinsic addInst = X86PaddInstruction[op.Size];

        res = context.AddIntrinsic(addInst, res, d);

        if (op.RegisterSize == RegisterSize.Simd64)
        {
            res = context.VectorZeroUpper64(res);
        }

        context.Copy(d, res);
    }
    else
    {
        EmitVectorShrImmOpZx(context, ShrImmFlags.Accumulate);
    }
}
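// A per-element model of USRA (illustrative sketch; hypothetical helper):
// shift each source element right by the immediate, then accumulate the
// result into the corresponding destination element with a wrapping add.
private static ulong UsraScalarModel(ulong d, ulong n, int shift)
{
    return unchecked(d + (n >> shift));
}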
public static void Vmin_I(ArmEmitterContext context)
{
    OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

    if (op.U)
    {
        if (Optimizations.UseSse2)
        {
            EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PminuInstruction[op.Size], op1, op2));
        }
        else
        {
            EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLessUI(op1, op2), op1, op2));
        }
    }
    else
    {
        if (Optimizations.UseSse2)
        {
            EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PminsInstruction[op.Size], op1, op2));
        }
        else
        {
            EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLess(op1, op2), op1, op2));
        }
    }
}
public static void Vneg_V(ArmEmitterContext context)
{
    OpCode32SimdCmpZ op = (OpCode32SimdCmpZ)context.CurrOp;

    if (op.F)
    {
        if (Optimizations.FastFP && Optimizations.UseSse2)
        {
            EmitVectorUnaryOpSimd32(context, (m) =>
            {
                if ((op.Size & 1) == 0)
                {
                    Operand mask = X86GetAllElements(context, -0f);
                    return context.AddIntrinsic(Intrinsic.X86Xorps, mask, m);
                }
                else
                {
                    Operand mask = X86GetAllElements(context, -0d);
                    return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, m);
                }
            });
        }
        else
        {
            EmitVectorUnaryOpF32(context, (op1) => context.Negate(op1));
        }
    }
    else
    {
        EmitVectorUnaryOpSx32(context, (op1) => context.Negate(op1));
    }
}
public static void Vnmul_S(ArmEmitterContext context)
{
    OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;

    if (Optimizations.UseSse2)
    {
        EmitScalarBinaryOpSimd32(context, (n, m) =>
        {
            if ((op.Size & 1) == 0)
            {
                Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
                Operand mask = X86GetScalar(context, -0f);
                return context.AddIntrinsic(Intrinsic.X86Xorps, mask, res);
            }
            else
            {
                Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
                Operand mask = X86GetScalar(context, -0d);
                return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, res);
            }
        });
    }
    else
    {
        EmitScalarBinaryOpF32(context, (op1, op2) => context.Negate(context.Multiply(op1, op2)));
    }
}
public static void EmitScalarTernaryOpF32(
    ArmEmitterContext context,
    Intrinsic inst32pt1,
    Intrinsic inst64pt1,
    Intrinsic inst32pt2,
    Intrinsic inst64pt2,
    bool isNegD = false)
{
    OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;

    bool doubleSize = (op.Size & 1) != 0;

    Intrinsic inst1 = doubleSize ? inst64pt1 : inst32pt1;
    Intrinsic inst2 = doubleSize ? inst64pt2 : inst32pt2;

    EmitScalarTernaryOpSimd32(context, (d, n, m) =>
    {
        Operand res = context.AddIntrinsic(inst1, n, m);

        if (isNegD)
        {
            Operand mask = doubleSize
                ? X86GetScalar(context, -0d)
                : X86GetScalar(context, -0f);

            d = doubleSize
                ? context.AddIntrinsic(Intrinsic.X86Xorpd, mask, d)
                : context.AddIntrinsic(Intrinsic.X86Xorps, mask, d);
        }

        return context.AddIntrinsic(inst2, d, res);
    });
}
public static void Shrn_V(ArmEmitterContext context)
{
    if (Optimizations.UseSsse3)
    {
        OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;

        int shift = GetImmShr(op);

        Operand d = GetVec(op.Rd);
        Operand n = GetVec(op.Rn);

        Operand dLow = context.AddIntrinsic(Intrinsic.X86Movlhps, d, context.VectorZero());

        Intrinsic srlInst = X86PsrlInstruction[op.Size + 1];

        Operand nShifted = context.AddIntrinsic(srlInst, n, Const(shift));

        Operand mask = X86GetAllElements(context, _masks_RshrnShrn[op.Size]);

        Operand res = context.AddIntrinsic(Intrinsic.X86Pshufb, nShifted, mask);

        Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128
            ? Intrinsic.X86Movlhps
            : Intrinsic.X86Movhlps;

        res = context.AddIntrinsic(movInst, dLow, res);

        context.Copy(d, res);
    }
    else
    {
        EmitVectorShrImmNarrowOpZx(context, round: false);
    }
}
public static void Cmhs_V(ArmEmitterContext context)
{
    OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

    if (Optimizations.UseSse41 && op.Size < 3)
    {
        Operand n = GetVec(op.Rn);
        Operand m = GetVec(op.Rm);

        Intrinsic maxInst = X86PmaxuInstruction[op.Size];

        Operand res = context.AddIntrinsic(maxInst, n, m);

        Intrinsic cmpInst = X86PcmpeqInstruction[op.Size];

        res = context.AddIntrinsic(cmpInst, res, n);

        if (op.RegisterSize == RegisterSize.Simd64)
        {
            res = context.VectorZeroUpper64(res);
        }

        context.Copy(GetVec(op.Rd), res);
    }
    else
    {
        EmitCmpOp(context, (op1, op2) => context.ICompareGreaterOrEqualUI(op1, op2), scalar: false);
    }
}
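// A per-element model of the unsigned >= trick above (illustrative sketch;
// hypothetical helper). SSE has no unsigned compare, but maxu(n, m) == n
// holds exactly when n >= m unsigned, so PMAXU followed by PCMPEQ against n
// produces the expected all-ones / all-zeros element mask.
private static uint CmhsScalarModel(uint n, uint m)
{
    uint max = n >= m ? n : m;          // PMAXU on one element.
    return max == n ? 0xFFFFFFFFu : 0u; // PCMPEQ against n.
}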
public static void Aesmc_V(ArmEmitterContext context)
{
    OpCode32Simd op = (OpCode32Simd)context.CurrOp;

    Operand n = GetVecA32(op.Qm);
    Operand res;

    if (Optimizations.UseAesni)
    {
        Operand roundKey = context.VectorZero();

        // Inverse Shift Rows, Inverse Sub Bytes, xor 0 so nothing happens.
        res = context.AddIntrinsic(Intrinsic.X86Aesdeclast, n, roundKey);

        // Shift Rows, Sub Bytes, Mix Columns (!), xor 0 so nothing happens.
        res = context.AddIntrinsic(Intrinsic.X86Aesenc, res, roundKey);
    }
    else
    {
        res = context.Call(new _V128_V128(SoftFallback.MixColumns), n);
    }

    context.Copy(GetVecA32(op.Qd), res);
}
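// Why the AESNI pair above reduces to plain MixColumns (a reasoning sketch):
//   aesdeclast(x, 0) = InvSubBytes(InvShiftRows(x))
//   aesenc(y, 0)     = MixColumns(ShiftRows(SubBytes(y)))
// SubBytes cancels InvSubBytes (both are bytewise) and ShiftRows cancels
// InvShiftRows, so the composition leaves MixColumns(x), which is exactly
// what the AESMC instruction requires.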
private static void EmitBifBit(ArmEmitterContext context, bool notRm)
{
    OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

    if (Optimizations.UseSse2)
    {
        EmitVectorTernaryOpSimd32(context, (d, n, m) =>
        {
            Operand res = context.AddIntrinsic(Intrinsic.X86Pxor, n, d);
            res = context.AddIntrinsic(notRm ? Intrinsic.X86Pandn : Intrinsic.X86Pand, m, res);
            return context.AddIntrinsic(Intrinsic.X86Pxor, d, res);
        });
    }
    else
    {
        EmitVectorTernaryOpZx32(context, (d, n, m) =>
        {
            if (notRm)
            {
                m = context.BitwiseNot(m);
            }

            return context.BitwiseExclusiveOr(
                context.BitwiseAnd(m, context.BitwiseExclusiveOr(d, n)), d);
        });
    }
}
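// A scalar model of the BIT/BIF insert identity used above (illustrative
// sketch; hypothetical helper). Bits of n replace bits of d wherever the
// mask is set; BIF (notRm) inverts the mask via PANDN so n lands where m
// is clear instead.
private static ulong BitBifScalarModel(ulong d, ulong n, ulong m, bool notRm)
{
    ulong mask = notRm ? ~m : m;
    return ((d ^ n) & mask) ^ d;
}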
public static void Cmle_V(ArmEmitterContext context)
{
    if (Optimizations.UseSse42)
    {
        OpCodeSimd op = (OpCodeSimd)context.CurrOp;

        Operand n = GetVec(op.Rn);

        Intrinsic cmpInst = X86PcmpgtInstruction[op.Size];

        Operand res = context.AddIntrinsic(cmpInst, n, context.VectorZero());

        Operand mask = X86GetAllElements(context, -1L);

        res = context.AddIntrinsic(Intrinsic.X86Pandn, res, mask);

        if (op.RegisterSize == RegisterSize.Simd64)
        {
            res = context.VectorZeroUpper64(res);
        }

        context.Copy(GetVec(op.Rd), res);
    }
    else
    {
        EmitCmpOp(context, (op1, op2) => context.ICompareLessOrEqual(op1, op2), scalar: false);
    }
}
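// A per-element model of the CMLE #0 trick above (illustrative sketch;
// hypothetical helper): n <= 0 is the complement of n > 0, so the emitted
// code computes PCMPGT(n, 0) and inverts it with PANDN against all-ones.
private static long CmleZeroScalarModel(long n)
{
    return ~(n > 0 ? -1L : 0L); // -1 (all ones) when n <= 0, else 0.
}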
public static void Vneg_S(ArmEmitterContext context)
{
    OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;

    if (Optimizations.UseSse2)
    {
        EmitScalarUnaryOpSimd32(context, (m) =>
        {
            if ((op.Size & 1) == 0)
            {
                Operand mask = X86GetScalar(context, -0f);
                return context.AddIntrinsic(Intrinsic.X86Xorps, mask, m);
            }
            else
            {
                Operand mask = X86GetScalar(context, -0d);
                return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, m);
            }
        });
    }
    else
    {
        EmitScalarUnaryOpF32(context, (op1) => context.Negate(op1));
    }
}
public static void Fcvtn_V(ArmEmitterContext context)
{
    OpCodeSimd op = (OpCodeSimd)context.CurrOp;

    int sizeF = op.Size & 1;

    if (Optimizations.UseSse2 && sizeF == 1)
    {
        Operand d = GetVec(op.Rd);
        Operand n = GetVec(op.Rn);

        Operand res = context.AddIntrinsic(Intrinsic.X86Movlhps, d, context.VectorZero());

        Operand nInt = context.AddIntrinsic(Intrinsic.X86Cvtpd2ps, n);

        nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt);

        Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128
            ? Intrinsic.X86Movlhps
            : Intrinsic.X86Movhlps;

        res = context.AddIntrinsic(movInst, res, nInt);

        context.Copy(GetVec(op.Rd), res);
    }
    else
    {
        OperandType type = sizeF == 0 ? OperandType.FP32 : OperandType.FP64;

        int elems = 4 >> sizeF;

        int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;

        Operand res = part == 0 ? context.VectorZero() : context.Copy(GetVec(op.Rd));

        for (int index = 0; index < elems; index++)
        {
            // Convert the element at the current index, not element 0.
            Operand ne = context.VectorExtract(type, GetVec(op.Rn), index);

            if (sizeF == 0)
            {
                Delegate dlg = new _U16_F32(SoftFloat32_16.FPConvert);

                Operand e = context.Call(dlg, ne);

                e = context.ZeroExtend16(OperandType.I64, e);

                res = EmitVectorInsert(context, res, e, part + index, 1);
            }
            else /* if (sizeF == 1) */
            {
                Operand e = context.ConvertToFP(OperandType.FP32, ne);

                res = context.VectorInsert(res, e, part + index);
            }
        }

        context.Copy(GetVec(op.Rd), res);
    }
}
private static void EmitSse2Scvtf(ArmEmitterContext context, bool scalar)
{
    OpCodeSimd op = (OpCodeSimd)context.CurrOp;

    Operand n = GetVec(op.Rn);

    Operand res = context.AddIntrinsic(Intrinsic.X86Cvtdq2ps, n);

    if (op is OpCodeSimdShImm fixedOp)
    {
        int fBits = GetImmShr(fixedOp);

        // BitConverter.Int32BitsToSingle(fpScaled) == 1f / MathF.Pow(2f, fBits)
        int fpScaled = 0x3F800000 - fBits * 0x800000;

        Operand scale = X86GetAllElements(context, fpScaled);

        res = context.AddIntrinsic(Intrinsic.X86Mulps, res, scale);
    }

    if (scalar)
    {
        res = context.VectorZeroUpper96(res);
    }
    else if (op.RegisterSize == RegisterSize.Simd64)
    {
        res = context.VectorZeroUpper64(res);
    }

    context.Copy(GetVec(op.Rd), res);
}
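// How the fpScaled constant above works (a worked note, not from the source):
// 1.0f has bit pattern 0x3F800000, and one step of the exponent field is
// 0x800000, so subtracting fBits * 0x800000 halves the value fBits times,
// producing 2^-fBits without any float arithmetic. For example,
// fBits = 8 gives 0x3F800000 - 8 * 0x800000 = 0x3B800000 == 1f / 256f.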
private static void EmitBifBit(ArmEmitterContext context, bool notRm)
{
    OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

    if (Optimizations.UseSse2)
    {
        Operand d = GetVec(op.Rd);
        Operand n = GetVec(op.Rn);
        Operand m = GetVec(op.Rm);

        Operand res = context.AddIntrinsic(Intrinsic.X86Pxor, n, d);

        if (notRm)
        {
            res = context.AddIntrinsic(Intrinsic.X86Pandn, m, res);
        }
        else
        {
            res = context.AddIntrinsic(Intrinsic.X86Pand, m, res);
        }

        res = context.AddIntrinsic(Intrinsic.X86Pxor, d, res);

        if (op.RegisterSize == RegisterSize.Simd64)
        {
            res = context.VectorZeroUpper64(res);
        }

        context.Copy(GetVec(op.Rd), res);
    }
    else
    {
        Operand res = context.VectorZero();

        int elems = op.RegisterSize == RegisterSize.Simd128 ? 2 : 1;

        for (int index = 0; index < elems; index++)
        {
            Operand d = EmitVectorExtractZx(context, op.Rd, index, 3);
            Operand n = EmitVectorExtractZx(context, op.Rn, index, 3);
            Operand m = EmitVectorExtractZx(context, op.Rm, index, 3);

            if (notRm)
            {
                m = context.BitwiseNot(m);
            }

            Operand e = context.BitwiseExclusiveOr(d, n);

            e = context.BitwiseAnd(e, m);
            e = context.BitwiseExclusiveOr(e, d);

            res = EmitVectorInsert(context, res, e, index, 3);
        }

        context.Copy(GetVec(op.Rd), res);
    }
}
public static void Fcvtl_V(ArmEmitterContext context)
{
    OpCodeSimd op = (OpCodeSimd)context.CurrOp;

    int sizeF = op.Size & 1;

    if (Optimizations.UseSse2 && sizeF == 1)
    {
        Operand n = GetVec(op.Rn);
        Operand res;

        if (op.RegisterSize == RegisterSize.Simd128)
        {
            res = context.AddIntrinsic(Intrinsic.X86Movhlps, n, n);
        }
        else
        {
            res = n;
        }

        res = context.AddIntrinsic(Intrinsic.X86Cvtps2pd, res);

        context.Copy(GetVec(op.Rd), res);
    }
    else
    {
        Operand res = context.VectorZero();

        int elems = 4 >> sizeF;

        int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;

        for (int index = 0; index < elems; index++)
        {
            if (sizeF == 0)
            {
                Operand ne = EmitVectorExtractZx(context, op.Rn, part + index, 1);

                Delegate dlg = new _F32_U16(SoftFloat16_32.FPConvert);

                Operand e = context.Call(dlg, ne);

                res = context.VectorInsert(res, e, index);
            }
            else /* if (sizeF == 1) */
            {
                Operand ne = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), part + index);

                Operand e = context.ConvertToFP(OperandType.FP64, ne);

                res = context.VectorInsert(res, e, index);
            }
        }

        context.Copy(GetVec(op.Rd), res);
    }
}
public static void Vext(ArmEmitterContext context)
{
    OpCode32SimdExt op = (OpCode32SimdExt)context.CurrOp;

    int elems = op.GetBytesCount();
    int byteOff = op.Immediate;

    if (Optimizations.UseSsse3)
    {
        EmitVectorBinaryOpSimd32(context, (n, m) =>
        {
            // Writing low to high of d: start <imm> into n, overlap into m.
            // Then rotate n down by <imm>, m up by (elems)-imm.
            // Then OR them together for the result.

            (long nMaskHigh, long nMaskLow) = MaskHelperByteSequence(0, elems - byteOff, byteOff);
            (long mMaskHigh, long mMaskLow) = MaskHelperByteSequence(elems - byteOff, byteOff, 0);

            Operand nMask, mMask;

            if (!op.Q)
            {
                // Do the same operation to the bytes in the top doubleword too, as our target could be in either.
                nMaskHigh = nMaskLow + 0x0808080808080808L;
                mMaskHigh = mMaskLow + 0x0808080808080808L;
            }

            nMask = X86GetElements(context, nMaskHigh, nMaskLow);
            mMask = X86GetElements(context, mMaskHigh, mMaskLow);

            Operand nPart = context.AddIntrinsic(Intrinsic.X86Pshufb, n, nMask);
            Operand mPart = context.AddIntrinsic(Intrinsic.X86Pshufb, m, mMask);

            return context.AddIntrinsic(Intrinsic.X86Por, nPart, mPart);
        });
    }
    else
    {
        Operand res = GetVecA32(op.Qd);

        for (int index = 0; index < elems; index++)
        {
            Operand extract;

            if (byteOff >= elems)
            {
                extract = EmitVectorExtractZx32(context, op.Qm, op.Im + (byteOff - elems), op.Size);
            }
            else
            {
                extract = EmitVectorExtractZx32(context, op.Qn, op.In + byteOff, op.Size);
            }

            byteOff++;

            res = EmitVectorInsert(context, res, extract, op.Id + index, op.Size);
        }

        context.Copy(GetVecA32(op.Qd), res);
    }
}
private static void EmitSri(ArmEmitterContext context, bool scalar)
{
    OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;

    int shift = GetImmShr(op);
    int eSize = 8 << op.Size;

    ulong mask = (ulong.MaxValue << (eSize - shift)) & (ulong.MaxValue >> (64 - eSize));

    if (Optimizations.UseSse2 && op.Size > 0)
    {
        Operand d = GetVec(op.Rd);
        Operand n = GetVec(op.Rn);

        Intrinsic srlInst = X86PsrlInstruction[op.Size];

        Operand nShifted = context.AddIntrinsic(srlInst, n, Const(shift));

        Operand dMask = X86GetAllElements(context, (long)mask * _masks_SliSri[op.Size]);

        Operand dMasked = context.AddIntrinsic(Intrinsic.X86Pand, d, dMask);

        Operand res = context.AddIntrinsic(Intrinsic.X86Por, nShifted, dMasked);

        if ((op.RegisterSize == RegisterSize.Simd64) || scalar)
        {
            res = context.VectorZeroUpper64(res);
        }

        context.Copy(d, res);
    }
    else
    {
        Operand res = context.VectorZero();

        int elems = !scalar ? op.GetBytesCount() >> op.Size : 1;

        for (int index = 0; index < elems; index++)
        {
            Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);

            Operand neShifted = shift != 64 ? context.ShiftRightUI(ne, Const(shift)) : Const(0UL);

            Operand de = EmitVectorExtractZx(context, op.Rd, index, op.Size);

            Operand deMasked = context.BitwiseAnd(de, Const(mask));

            Operand e = context.BitwiseOr(neShifted, deMasked);

            res = EmitVectorInsert(context, res, e, index, op.Size);
        }

        context.Copy(GetVec(op.Rd), res);
    }
}
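// A per-element model of SRI (illustrative sketch; hypothetical helper):
// the top 'shift' bits of each destination element survive through 'mask'
// (computed as above), and the shifted source fills the remaining low bits.
private static ulong SriScalarModel(ulong d, ulong n, int shift, ulong mask)
{
    return (n >> shift) | (d & mask);
}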
private static void EmitVectorZip(ArmEmitterContext context, int part)
{
    OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

    if (Optimizations.UseSse2)
    {
        Operand n = GetVec(op.Rn);
        Operand m = GetVec(op.Rm);

        if (op.RegisterSize == RegisterSize.Simd128)
        {
            Intrinsic punpckInst = part == 0
                ? X86PunpcklInstruction[op.Size]
                : X86PunpckhInstruction[op.Size];

            Operand res = context.AddIntrinsic(punpckInst, n, m);

            context.Copy(GetVec(op.Rd), res);
        }
        else
        {
            Operand res = context.AddIntrinsic(X86PunpcklInstruction[op.Size], n, m);

            Intrinsic punpckInst = part == 0
                ? Intrinsic.X86Punpcklqdq
                : Intrinsic.X86Punpckhqdq;

            res = context.AddIntrinsic(punpckInst, res, context.VectorZero());

            context.Copy(GetVec(op.Rd), res);
        }
    }
    else
    {
        Operand res = context.VectorZero();

        int pairs = op.GetPairsCount() >> op.Size;

        int baseIndex = part != 0 ? pairs : 0;

        for (int index = 0; index < pairs; index++)
        {
            int pairIndex = index << 1;

            Operand ne = EmitVectorExtractZx(context, op.Rn, baseIndex + index, op.Size);
            Operand me = EmitVectorExtractZx(context, op.Rm, baseIndex + index, op.Size);

            res = EmitVectorInsert(context, res, ne, pairIndex, op.Size);
            res = EmitVectorInsert(context, res, me, pairIndex + 1, op.Size);
        }

        context.Copy(GetVec(op.Rd), res);
    }
}
public static void EmitVectorsByScalarOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2)
{
    OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;

    Intrinsic inst1 = (op.Size & 1) != 0 ? inst64pt1 : inst32pt1;
    Intrinsic inst2 = (op.Size & 1) != 0 ? inst64pt2 : inst32pt2;

    EmitVectorsByScalarOpSimd32(context, (d, n, m) =>
    {
        Operand res = context.AddIntrinsic(inst1, n, m);

        return context.AddIntrinsic(inst2, d, res);
    });
}
public static void Dup_Gp(ArmEmitterContext context)
{
    OpCodeSimdIns op = (OpCodeSimdIns)context.CurrOp;

    Operand n = GetIntOrZR(context, op.Rn);

    if (Optimizations.UseSse2)
    {
        switch (op.Size)
        {
            case 0:
                n = context.ZeroExtend8(n.Type, n);
                n = context.Multiply(n, Const(n.Type, 0x01010101));
                break;
            case 1:
                n = context.ZeroExtend16(n.Type, n);
                n = context.Multiply(n, Const(n.Type, 0x00010001));
                break;
            case 2:
                n = context.ZeroExtend32(n.Type, n);
                break;
        }

        Operand res = context.VectorInsert(context.VectorZero(), n, 0);

        if (op.Size < 3)
        {
            if (op.RegisterSize == RegisterSize.Simd64)
            {
                res = context.AddIntrinsic(Intrinsic.X86Shufps, res, res, Const(0xf0));
            }
            else
            {
                res = context.AddIntrinsic(Intrinsic.X86Shufps, res, res, Const(0));
            }
        }
        else
        {
            res = context.AddIntrinsic(Intrinsic.X86Movlhps, res, res);
        }

        context.Copy(GetVec(op.Rd), res);
    }
    else
    {
        Operand res = context.VectorZero();

        int elems = op.GetBytesCount() >> op.Size;

        for (int index = 0; index < elems; index++)
        {
            res = EmitVectorInsert(context, res, n, index, op.Size);
        }

        context.Copy(GetVec(op.Rd), res);
    }
}
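// A model of the multiply-broadcast trick above (illustrative sketch;
// hypothetical helper): a zero-extended byte times 0x01010101 copies that
// byte into all four byte positions of a 32-bit lane, e.g. 0xAB becomes
// 0xABABABAB; the halfword case uses 0x00010001 the same way.
private static uint DupByteScalarModel(byte value)
{
    return value * 0x01010101u;
}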
public static void Rshrn_V(ArmEmitterContext context)
{
    if (Optimizations.UseSsse3)
    {
        OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;

        int shift = GetImmShr(op);
        long roundConst = 1L << (shift - 1);

        Operand d = GetVec(op.Rd);
        Operand n = GetVec(op.Rn);

        Operand dLow = context.AddIntrinsic(Intrinsic.X86Movlhps, d, context.VectorZero());

        Operand mask = null;

        switch (op.Size + 1)
        {
            case 1: mask = X86GetAllElements(context, (int)roundConst * 0x00010001); break;
            case 2: mask = X86GetAllElements(context, (int)roundConst); break;
            case 3: mask = X86GetAllElements(context, roundConst); break;
        }

        Intrinsic addInst = X86PaddInstruction[op.Size + 1];

        Operand res = context.AddIntrinsic(addInst, n, mask);

        Intrinsic srlInst = X86PsrlInstruction[op.Size + 1];

        res = context.AddIntrinsic(srlInst, res, Const(shift));

        Operand mask2 = X86GetAllElements(context, _masks_RshrnShrn[op.Size]);

        res = context.AddIntrinsic(Intrinsic.X86Pshufb, res, mask2);

        Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128
            ? Intrinsic.X86Movlhps
            : Intrinsic.X86Movhlps;

        res = context.AddIntrinsic(movInst, dLow, res);

        context.Copy(d, res);
    }
    else
    {
        EmitVectorShrImmNarrowOpZx(context, round: true);
    }
}
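// A per-element model of the rounding narrow shift above (illustrative
// sketch; hypothetical helper): adding 1 << (shift - 1) before the logical
// shift rounds the quotient half-up, which is what the PADD of the round
// constant accomplishes before the PSRL.
private static ulong RshrScalarModel(ulong n, int shift)
{
    return (n + (1UL << (shift - 1))) >> shift;
}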
public static void Ext_V(ArmEmitterContext context)
{
    OpCodeSimdExt op = (OpCodeSimdExt)context.CurrOp;

    if (Optimizations.UseSse2)
    {
        Operand nShifted = GetVec(op.Rn);

        if (op.RegisterSize == RegisterSize.Simd64)
        {
            nShifted = context.VectorZeroUpper64(nShifted);
        }

        nShifted = context.AddIntrinsic(Intrinsic.X86Psrldq, nShifted, Const(op.Imm4));

        Operand mShifted = GetVec(op.Rm);

        mShifted = context.AddIntrinsic(Intrinsic.X86Pslldq, mShifted, Const(op.GetBytesCount() - op.Imm4));

        if (op.RegisterSize == RegisterSize.Simd64)
        {
            mShifted = context.VectorZeroUpper64(mShifted);
        }

        Operand res = context.AddIntrinsic(Intrinsic.X86Por, nShifted, mShifted);

        context.Copy(GetVec(op.Rd), res);
    }
    else
    {
        Operand res = context.VectorZero();

        int bytes = op.GetBytesCount();
        int position = op.Imm4 & (bytes - 1);

        for (int index = 0; index < bytes; index++)
        {
            int reg = op.Imm4 + index < bytes ? op.Rn : op.Rm;

            Operand e = EmitVectorExtractZx(context, reg, position, 0);

            position = (position + 1) & (bytes - 1);

            res = EmitVectorInsert(context, res, e, index, 0);
        }

        context.Copy(GetVec(op.Rd), res);
    }
}
private static void EmitCrc32Optimized64(ArmEmitterContext context, bool castagnoli)
{
    OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp;

    long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
    long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1

    Operand crc = GetIntOrZR(context, op.Rn);
    Operand data = GetIntOrZR(context, op.Rm);

    crc = context.VectorInsert(context.VectorZero(), crc, 0);
    data = context.VectorInsert(context.VectorZero(), data, 0);

    Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data);
    Operand res = context.AddIntrinsic(Intrinsic.X86Pslldq, tmp, Const(4));

    tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, res, X86GetScalar(context, mu), Const(0));
    tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
    tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, res);
    tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(32));

    tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(1));
    tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));

    SetIntOrZR(context, op.Rd, context.VectorExtract(OperandType.I32, tmp, 2));
}
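// The sequence above is a Barrett-style reduction: two carry-less multiplies
// fold the 64-bit input by mu (an approximation of x^64 / P(x), per the
// comment above) and by the CRC polynomial P'(x), replacing a bit-serial CRC
// loop with a handful of PCLMULQDQ operations. The reduced 32-bit remainder
// is then read out of element 2 of the final product by the VectorExtract.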
public static Operand EmitSha256su1(ArmEmitterContext context, Operand x, Operand y, Operand z)
{
    if (Optimizations.UseSha && Optimizations.UseSsse3)
    {
        Operand extr = context.AddIntrinsic(Intrinsic.X86Palignr, z, y, Const(4));

        Operand tmp = context.AddIntrinsic(Intrinsic.X86Paddd, extr, x);

        Operand res = context.AddIntrinsic(Intrinsic.X86Sha256Msg2, tmp, z);

        return res;
    }

    return context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Sha256SchedulePart2)), x, y, z);
}
// Pairwise
public static void EmitSse2VectorPairwiseOpF32(ArmEmitterContext context, Intrinsic inst32)
{
    OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

    EmitVectorBinaryOpSimd32(context, (n, m) =>
    {
        Operand unpck = context.AddIntrinsic(Intrinsic.X86Unpcklps, n, m);

        Operand part0 = unpck;
        Operand part1 = context.AddIntrinsic(Intrinsic.X86Movhlps, unpck, unpck);

        return context.AddIntrinsic(inst32, part0, part1);
    }, 0);
}