From 0f5b6dfbe8d4bcc4df3f670e366a967d8ea103db Mon Sep 17 00:00:00 2001 From: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com> Date: Wed, 26 Dec 2018 18:11:36 +0100 Subject: [PATCH] Fix Frecpe_S/V and Frsqrte_S/V (full FP emu.). Add Sse Opt. & SoftFloat Impl. for Fcmeq/ge/gt/le/lt_S/V (Reg & Zero), Faddp_S/V, Fmaxp_V, Fminp_V Inst.; add Sse Opt. for Shll_V, S/Ushll_V Inst.; improve Sse Opt. for Xtn_V Inst.. Add Tests. (#543) * Update Optimizations.cs * Update InstEmitSimdShift.cs * Update InstEmitSimdHelper.cs * Update InstEmitSimdArithmetic.cs * Update InstEmitSimdMove.cs * Update SoftFloat.cs * Update InstEmitSimdCmp.cs * Update CpuTestSimdShImm.cs * Update CpuTestSimd.cs * Update CpuTestSimdReg.cs * Nit. * Update SoftFloat.cs * Update InstEmitSimdArithmetic.cs * Update InstEmitSimdHelper.cs * Update CpuTestSimd.cs * Explicit some implicit casts. * Simplify some powers; nits. * Update OpCodeTable.cs * Update InstEmitSimdArithmetic.cs * Update CpuTestSimdReg.cs * Update InstEmitSimdArithmetic.cs --- .../Instructions/InstEmitSimdArithmetic.cs | 268 ++++++- ChocolArm64/Instructions/InstEmitSimdCmp.cs | 296 ++++--- .../Instructions/InstEmitSimdHelper.cs | 110 ++- ChocolArm64/Instructions/InstEmitSimdMove.cs | 80 +- ChocolArm64/Instructions/InstEmitSimdShift.cs | 98 ++- ChocolArm64/Instructions/SoftFloat.cs | 721 ++++++++++++++---- ChocolArm64/OpCodeTable.cs | 1 + ChocolArm64/Optimizations.cs | 2 + Ryujinx.Tests/Cpu/CpuTestSimd.cs | 409 +++++++++- Ryujinx.Tests/Cpu/CpuTestSimdReg.cs | 168 +++- Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs | 96 +++ 11 files changed, 1808 insertions(+), 441 deletions(-) diff --git a/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs b/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs index 013d0432..d1e71ecb 100644 --- a/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs +++ b/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs @@ -176,12 +176,119 @@ namespace ChocolArm64.Instructions public static void Fabd_S(ILEmitterCtx context) { - EmitScalarBinaryOpF(context, () => + if (Optimizations.FastFP && Optimizations.UseSse2) { - context.Emit(OpCodes.Sub); + OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; - EmitUnaryMathCall(context, nameof(Math.Abs)); - }); + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + Type[] typesSsv = new Type[] { typeof(float) }; + Type[] typesSubAndNot = new Type[] { typeof(Vector128), typeof(Vector128) }; + + context.EmitLdc_R4(-0f); + context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SetScalarVector128), typesSsv)); + + context.EmitLdvec(op.Rn); + context.EmitLdvec(op.Rm); + + context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SubtractScalar), typesSubAndNot)); + + context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.AndNot), typesSubAndNot)); + + context.EmitStvec(op.Rd); + + EmitVectorZero32_128(context, op.Rd); + } + else /* if (sizeF == 1) */ + { + Type[] typesSsv = new Type[] { typeof(double) }; + Type[] typesSubAndNot = new Type[] { typeof(Vector128), typeof(Vector128) }; + + context.EmitLdc_R8(-0d); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetScalarVector128), typesSsv)); + + EmitLdvecWithCastToDouble(context, op.Rn); + EmitLdvecWithCastToDouble(context, op.Rm); + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SubtractScalar), typesSubAndNot)); + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesSubAndNot)); + + EmitStvecWithCastFromDouble(context, op.Rd); + + EmitVectorZeroUpper(context, op.Rd); + } + } + else + { + EmitScalarBinaryOpF(context, () => + { + EmitSoftFloatCall(context, nameof(SoftFloat32.FPSub)); + + EmitUnaryMathCall(context, nameof(Math.Abs)); + }); + } + } + + public static void Fabd_V(ILEmitterCtx context) + { + if (Optimizations.FastFP && Optimizations.UseSse2) + { + OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; + + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + Type[] typesSav = new Type[] { typeof(float) }; + Type[] typesSubAndNot = new Type[] { typeof(Vector128), typeof(Vector128) }; + + context.EmitLdc_R4(-0f); + context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SetAllVector128), typesSav)); + + context.EmitLdvec(op.Rn); + context.EmitLdvec(op.Rm); + + context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Subtract), typesSubAndNot)); + + context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.AndNot), typesSubAndNot)); + + context.EmitStvec(op.Rd); + + if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } + } + else /* if (sizeF == 1) */ + { + Type[] typesSav = new Type[] { typeof(double) }; + Type[] typesSubAndNot = new Type[] { typeof(Vector128), typeof(Vector128) }; + + context.EmitLdc_R8(-0d); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav)); + + EmitLdvecWithCastToDouble(context, op.Rn); + EmitLdvecWithCastToDouble(context, op.Rm); + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesSubAndNot)); + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesSubAndNot)); + + EmitStvecWithCastFromDouble(context, op.Rd); + } + } + else + { + EmitVectorBinaryOpF(context, () => + { + EmitSoftFloatCall(context, nameof(SoftFloat32.FPSub)); + + EmitUnaryMathCall(context, nameof(Math.Abs)); + }); + } } public static void Fabs_S(ILEmitterCtx context) @@ -321,17 +428,60 @@ namespace ChocolArm64.Instructions int sizeF = op.Size & 1; - EmitVectorExtractF(context, op.Rn, 0, sizeF); - EmitVectorExtractF(context, op.Rn, 1, sizeF); + if (Optimizations.FastFP && Optimizations.UseSse3) + { + if (sizeF == 0) + { + Type[] typesAddH = new Type[] { typeof(Vector128), typeof(Vector128) }; - context.Emit(OpCodes.Add); + context.EmitLdvec(op.Rn); + context.Emit(OpCodes.Dup); - EmitScalarSetF(context, op.Rd, sizeF); + context.EmitCall(typeof(Sse3).GetMethod(nameof(Sse3.HorizontalAdd), typesAddH)); + + context.EmitStvec(op.Rd); + + EmitVectorZero32_128(context, op.Rd); + } + else /* if (sizeF == 1) */ + { + Type[] typesAddH = new Type[] { typeof(Vector128), typeof(Vector128) }; + + EmitLdvecWithCastToDouble(context, op.Rn); + context.Emit(OpCodes.Dup); + + context.EmitCall(typeof(Sse3).GetMethod(nameof(Sse3.HorizontalAdd), typesAddH)); + + EmitStvecWithCastFromDouble(context, op.Rd); + + EmitVectorZeroUpper(context, op.Rd); + } + } + else + { + EmitVectorExtractF(context, op.Rn, 0, sizeF); + EmitVectorExtractF(context, op.Rn, 1, sizeF); + + EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd)); + + EmitScalarSetF(context, op.Rd, sizeF); + } } public static void Faddp_V(ILEmitterCtx context) { - EmitVectorPairwiseOpF(context, () => context.Emit(OpCodes.Add)); + if (Optimizations.FastFP && Optimizations.UseSse + && Optimizations.UseSse2) + { + EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Add)); + } + else + { + EmitVectorPairwiseOpF(context, () => + { + EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd)); + }); + } } public static void Fdiv_S(ILEmitterCtx context) @@ -462,10 +612,18 @@ namespace ChocolArm64.Instructions public static void Fmaxp_V(ILEmitterCtx context) { - EmitVectorPairwiseOpF(context, () => + if (Optimizations.FastFP && Optimizations.UseSse + && Optimizations.UseSse2) { - EmitSoftFloatCall(context, nameof(SoftFloat32.FPMax)); - }); + EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Max)); + } + else + { + EmitVectorPairwiseOpF(context, () => + { + EmitSoftFloatCall(context, nameof(SoftFloat32.FPMax)); + }); + } } public static void Fmin_S(ILEmitterCtx context) @@ -518,10 +676,18 @@ namespace ChocolArm64.Instructions public static void Fminp_V(ILEmitterCtx context) { - EmitVectorPairwiseOpF(context, () => + if (Optimizations.FastFP && Optimizations.UseSse + && Optimizations.UseSse2) { - EmitSoftFloatCall(context, nameof(SoftFloat32.FPMin)); - }); + EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Min)); + } + else + { + EmitVectorPairwiseOpF(context, () => + { + EmitSoftFloatCall(context, nameof(SoftFloat32.FPMin)); + }); + } } public static void Fmla_Se(ILEmitterCtx context) @@ -1085,18 +1251,42 @@ namespace ChocolArm64.Instructions public static void Frecpe_S(ILEmitterCtx context) { - EmitScalarUnaryOpF(context, () => + OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; + + int sizeF = op.Size & 1; + + if (Optimizations.FastFP && Optimizations.UseSse + && sizeF == 0) { - EmitUnarySoftFloatCall(context, nameof(SoftFloat.RecipEstimate)); - }); + EmitScalarSseOrSse2OpF(context, nameof(Sse.ReciprocalScalar)); + } + else + { + EmitScalarUnaryOpF(context, () => + { + EmitSoftFloatCall(context, nameof(SoftFloat32.FPRecipEstimate)); + }); + } } public static void Frecpe_V(ILEmitterCtx context) { - EmitVectorUnaryOpF(context, () => + OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; + + int sizeF = op.Size & 1; + + if (Optimizations.FastFP && Optimizations.UseSse + && sizeF == 0) { - EmitUnarySoftFloatCall(context, nameof(SoftFloat.RecipEstimate)); - }); + EmitVectorSseOrSse2OpF(context, nameof(Sse.Reciprocal)); + } + else + { + EmitVectorUnaryOpF(context, () => + { + EmitSoftFloatCall(context, nameof(SoftFloat32.FPRecipEstimate)); + }); + } } public static void Frecps_S(ILEmitterCtx context) // Fused. @@ -1398,18 +1588,42 @@ namespace ChocolArm64.Instructions public static void Frsqrte_S(ILEmitterCtx context) { - EmitScalarUnaryOpF(context, () => + OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; + + int sizeF = op.Size & 1; + + if (Optimizations.FastFP && Optimizations.UseSse + && sizeF == 0) { - EmitUnarySoftFloatCall(context, nameof(SoftFloat.InvSqrtEstimate)); - }); + EmitScalarSseOrSse2OpF(context, nameof(Sse.ReciprocalSqrtScalar)); + } + else + { + EmitScalarUnaryOpF(context, () => + { + EmitSoftFloatCall(context, nameof(SoftFloat32.FPRSqrtEstimate)); + }); + } } public static void Frsqrte_V(ILEmitterCtx context) { - EmitVectorUnaryOpF(context, () => + OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; + + int sizeF = op.Size & 1; + + if (Optimizations.FastFP && Optimizations.UseSse + && sizeF == 0) { - EmitUnarySoftFloatCall(context, nameof(SoftFloat.InvSqrtEstimate)); - }); + EmitVectorSseOrSse2OpF(context, nameof(Sse.ReciprocalSqrt)); + } + else + { + EmitVectorUnaryOpF(context, () => + { + EmitSoftFloatCall(context, nameof(SoftFloat32.FPRSqrtEstimate)); + }); + } } public static void Frsqrts_S(ILEmitterCtx context) // Fused. diff --git a/ChocolArm64/Instructions/InstEmitSimdCmp.cs b/ChocolArm64/Instructions/InstEmitSimdCmp.cs index 3ee25482..e1184375 100644 --- a/ChocolArm64/Instructions/InstEmitSimdCmp.cs +++ b/ChocolArm64/Instructions/InstEmitSimdCmp.cs @@ -15,7 +15,7 @@ namespace ChocolArm64.Instructions { public static void Cmeq_S(ILEmitterCtx context) { - EmitCmp(context, OpCodes.Beq_S, scalar: true); + EmitCmpOp(context, OpCodes.Beq_S, scalar: true); } public static void Cmeq_V(ILEmitterCtx context) @@ -32,28 +32,28 @@ namespace ChocolArm64.Instructions } else { - EmitCmp(context, OpCodes.Beq_S, scalar: false); + EmitCmpOp(context, OpCodes.Beq_S, scalar: false); } } else { - EmitCmp(context, OpCodes.Beq_S, scalar: false); + EmitCmpOp(context, OpCodes.Beq_S, scalar: false); } } public static void Cmge_S(ILEmitterCtx context) { - EmitCmp(context, OpCodes.Bge_S, scalar: true); + EmitCmpOp(context, OpCodes.Bge_S, scalar: true); } public static void Cmge_V(ILEmitterCtx context) { - EmitCmp(context, OpCodes.Bge_S, scalar: false); + EmitCmpOp(context, OpCodes.Bge_S, scalar: false); } public static void Cmgt_S(ILEmitterCtx context) { - EmitCmp(context, OpCodes.Bgt_S, scalar: true); + EmitCmpOp(context, OpCodes.Bgt_S, scalar: true); } public static void Cmgt_V(ILEmitterCtx context) @@ -70,63 +70,63 @@ namespace ChocolArm64.Instructions } else { - EmitCmp(context, OpCodes.Bgt_S, scalar: false); + EmitCmpOp(context, OpCodes.Bgt_S, scalar: false); } } else { - EmitCmp(context, OpCodes.Bgt_S, scalar: false); + EmitCmpOp(context, OpCodes.Bgt_S, scalar: false); } } public static void Cmhi_S(ILEmitterCtx context) { - EmitCmp(context, OpCodes.Bgt_Un_S, scalar: true); + EmitCmpOp(context, OpCodes.Bgt_Un_S, scalar: true); } public static void Cmhi_V(ILEmitterCtx context) { - EmitCmp(context, OpCodes.Bgt_Un_S, scalar: false); + EmitCmpOp(context, OpCodes.Bgt_Un_S, scalar: false); } public static void Cmhs_S(ILEmitterCtx context) { - EmitCmp(context, OpCodes.Bge_Un_S, scalar: true); + EmitCmpOp(context, OpCodes.Bge_Un_S, scalar: true); } public static void Cmhs_V(ILEmitterCtx context) { - EmitCmp(context, OpCodes.Bge_Un_S, scalar: false); + EmitCmpOp(context, OpCodes.Bge_Un_S, scalar: false); } public static void Cmle_S(ILEmitterCtx context) { - EmitCmp(context, OpCodes.Ble_S, scalar: true); + EmitCmpOp(context, OpCodes.Ble_S, scalar: true); } public static void Cmle_V(ILEmitterCtx context) { - EmitCmp(context, OpCodes.Ble_S, scalar: false); + EmitCmpOp(context, OpCodes.Ble_S, scalar: false); } public static void Cmlt_S(ILEmitterCtx context) { - EmitCmp(context, OpCodes.Blt_S, scalar: true); + EmitCmpOp(context, OpCodes.Blt_S, scalar: true); } public static void Cmlt_V(ILEmitterCtx context) { - EmitCmp(context, OpCodes.Blt_S, scalar: false); + EmitCmpOp(context, OpCodes.Blt_S, scalar: false); } public static void Cmtst_S(ILEmitterCtx context) { - EmitCmtst(context, scalar: true); + EmitCmtstOp(context, scalar: true); } public static void Cmtst_V(ILEmitterCtx context) { - EmitCmtst(context, scalar: false); + EmitCmtstOp(context, scalar: false); } public static void Fccmp_S(ILEmitterCtx context) @@ -145,7 +145,7 @@ namespace ChocolArm64.Instructions context.MarkLabel(lblTrue); - EmitFcmpE(context, signalNaNs: false); + EmitFcmpOrFcmpe(context, signalNaNs: false); context.MarkLabel(lblEnd); } @@ -166,120 +166,152 @@ namespace ChocolArm64.Instructions context.MarkLabel(lblTrue); - EmitFcmpE(context, signalNaNs: true); + EmitFcmpOrFcmpe(context, signalNaNs: true); context.MarkLabel(lblEnd); } public static void Fcmeq_S(ILEmitterCtx context) { - if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse + && Optimizations.UseSse2) { - EmitScalarSseOrSse2OpF(context, nameof(Sse.CompareEqualScalar)); + EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareEqualScalar), scalar: true); } else { - EmitScalarFcmp(context, OpCodes.Beq_S); + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareEQ), scalar: true); } } public static void Fcmeq_V(ILEmitterCtx context) { - if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse + && Optimizations.UseSse2) { - EmitVectorSseOrSse2OpF(context, nameof(Sse.CompareEqual)); + EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareEqual), scalar: false); } else { - EmitVectorFcmp(context, OpCodes.Beq_S); + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareEQ), scalar: false); } } public static void Fcmge_S(ILEmitterCtx context) { - if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse + && Optimizations.UseSse2) { - EmitScalarSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqualScalar)); + EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqualScalar), scalar: true); } else { - EmitScalarFcmp(context, OpCodes.Bge_S); + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareGE), scalar: true); } } public static void Fcmge_V(ILEmitterCtx context) { - if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse + && Optimizations.UseSse2) { - EmitVectorSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqual)); + EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqual), scalar: false); } else { - EmitVectorFcmp(context, OpCodes.Bge_S); + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareGE), scalar: false); } } public static void Fcmgt_S(ILEmitterCtx context) { - if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse + && Optimizations.UseSse2) { - EmitScalarSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanScalar)); + EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanScalar), scalar: true); } else { - EmitScalarFcmp(context, OpCodes.Bgt_S); + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareGT), scalar: true); } } public static void Fcmgt_V(ILEmitterCtx context) { - if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse + && Optimizations.UseSse2) { - EmitVectorSseOrSse2OpF(context, nameof(Sse.CompareGreaterThan)); + EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThan), scalar: false); } else { - EmitVectorFcmp(context, OpCodes.Bgt_S); + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareGT), scalar: false); } } public static void Fcmle_S(ILEmitterCtx context) { - EmitScalarFcmp(context, OpCodes.Ble_S); + if (Optimizations.FastFP && Optimizations.UseSse + && Optimizations.UseSse2) + { + EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqualScalar), scalar: true, isLeOrLt: true); + } + else + { + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareLE), scalar: true); + } } public static void Fcmle_V(ILEmitterCtx context) { - EmitVectorFcmp(context, OpCodes.Ble_S); + if (Optimizations.FastFP && Optimizations.UseSse + && Optimizations.UseSse2) + { + EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqual), scalar: false, isLeOrLt: true); + } + else + { + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareLE), scalar: false); + } } public static void Fcmlt_S(ILEmitterCtx context) { - EmitScalarFcmp(context, OpCodes.Blt_S); + if (Optimizations.FastFP && Optimizations.UseSse + && Optimizations.UseSse2) + { + EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanScalar), scalar: true, isLeOrLt: true); + } + else + { + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareLT), scalar: true); + } } public static void Fcmlt_V(ILEmitterCtx context) { - EmitVectorFcmp(context, OpCodes.Blt_S); + if (Optimizations.FastFP && Optimizations.UseSse + && Optimizations.UseSse2) + { + EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThan), scalar: false, isLeOrLt: true); + } + else + { + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareLT), scalar: false); + } } public static void Fcmp_S(ILEmitterCtx context) { - EmitFcmpE(context, signalNaNs: false); + EmitFcmpOrFcmpe(context, signalNaNs: false); } public static void Fcmpe_S(ILEmitterCtx context) { - EmitFcmpE(context, signalNaNs: true); + EmitFcmpOrFcmpe(context, signalNaNs: true); } - private static void EmitFcmpE(ILEmitterCtx context, bool signalNaNs) + private static void EmitFcmpOrFcmpe(ILEmitterCtx context, bool signalNaNs) { OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; @@ -430,7 +462,7 @@ namespace ChocolArm64.Instructions { context.EmitLdc_R4(0f); } - else // if (op.Size == 1) + else /* if (op.Size == 1) */ { context.EmitLdc_R8(0d); } @@ -448,7 +480,7 @@ namespace ChocolArm64.Instructions } } - private static void EmitCmp(ILEmitterCtx context, OpCode ilOp, bool scalar) + private static void EmitCmpOp(ILEmitterCtx context, OpCode ilOp, bool scalar) { OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; @@ -492,7 +524,7 @@ namespace ChocolArm64.Instructions } } - private static void EmitCmtst(ILEmitterCtx context, bool scalar) + private static void EmitCmtstOp(ILEmitterCtx context, bool scalar) { OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; @@ -532,84 +564,134 @@ namespace ChocolArm64.Instructions } } - private static void EmitScalarFcmp(ILEmitterCtx context, OpCode ilOp) - { - EmitFcmp(context, ilOp, 0, scalar: true); - } - - private static void EmitVectorFcmp(ILEmitterCtx context, OpCode ilOp) + private static void EmitCmpOpF(ILEmitterCtx context, string name, bool scalar) { OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; int sizeF = op.Size & 1; int bytes = op.GetBitsCount() >> 3; - int elems = bytes >> sizeF + 2; + int elems = !scalar ? bytes >> sizeF + 2 : 1; for (int index = 0; index < elems; index++) { - EmitFcmp(context, ilOp, index, scalar: false); + EmitVectorExtractF(context, op.Rn, index, sizeF); + + if (op is OpCodeSimdReg64 binOp) + { + EmitVectorExtractF(context, binOp.Rm, index, sizeF); + } + else + { + if (sizeF == 0) + { + context.EmitLdc_R4(0f); + } + else /* if (sizeF == 1) */ + { + context.EmitLdc_R8(0d); + } + } + + EmitSoftFloatCall(context, name); + + EmitVectorInsertF(context, op.Rd, index, sizeF); } - if (op.RegisterSize == RegisterSize.Simd64) + if (!scalar) { - EmitVectorZeroUpper(context, op.Rd); + if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } + } + else + { + if (sizeF == 0) + { + EmitVectorZero32_128(context, op.Rd); + } + else /* if (sizeF == 1) */ + { + EmitVectorZeroUpper(context, op.Rd); + } } } - private static void EmitFcmp(ILEmitterCtx context, OpCode ilOp, int index, bool scalar) + private static void EmitCmpSseOrSse2OpF(ILEmitterCtx context, string name, bool scalar, bool isLeOrLt = false) { OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; int sizeF = op.Size & 1; - ulong szMask = ulong.MaxValue >> (64 - (32 << sizeF)); - - EmitVectorExtractF(context, op.Rn, index, sizeF); - - if (op is OpCodeSimdReg64 binOp) + if (sizeF == 0) { - EmitVectorExtractF(context, binOp.Rm, index, sizeF); - } - else if (sizeF == 0) - { - context.EmitLdc_R4(0f); + Type[] types = new Type[] { typeof(Vector128), typeof(Vector128) }; + + if (!isLeOrLt) + { + context.EmitLdvec(op.Rn); + } + + if (op is OpCodeSimdReg64 binOp) + { + context.EmitLdvec(binOp.Rm); + } + else + { + VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero)); + } + + if (isLeOrLt) + { + context.EmitLdvec(op.Rn); + } + + context.EmitCall(typeof(Sse).GetMethod(name, types)); + + context.EmitStvec(op.Rd); + + if (scalar) + { + EmitVectorZero32_128(context, op.Rd); + } + else if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } } else /* if (sizeF == 1) */ { - context.EmitLdc_R8(0d); + Type[] types = new Type[] { typeof(Vector128), typeof(Vector128) }; + + if (!isLeOrLt) + { + EmitLdvecWithCastToDouble(context, op.Rn); + } + + if (op is OpCodeSimdReg64 binOp) + { + EmitLdvecWithCastToDouble(context, binOp.Rm); + } + else + { + VectorHelper.EmitCall(context, nameof(VectorHelper.VectorDoubleZero)); + } + + if (isLeOrLt) + { + EmitLdvecWithCastToDouble(context, op.Rn); + } + + context.EmitCall(typeof(Sse2).GetMethod(name, types)); + + EmitStvecWithCastFromDouble(context, op.Rd); + + if (scalar) + { + EmitVectorZeroUpper(context, op.Rd); + } } - - ILLabel lblTrue = new ILLabel(); - ILLabel lblEnd = new ILLabel(); - - context.Emit(ilOp, lblTrue); - - if (scalar) - { - EmitVectorZeroAll(context, op.Rd); - } - else - { - EmitVectorInsert(context, op.Rd, index, sizeF + 2, 0); - } - - context.Emit(OpCodes.Br_S, lblEnd); - - context.MarkLabel(lblTrue); - - if (scalar) - { - EmitVectorInsert(context, op.Rd, index, 3, (long)szMask); - - EmitVectorZeroUpper(context, op.Rd); - } - else - { - EmitVectorInsert(context, op.Rd, index, sizeF + 2, (long)szMask); - } - - context.MarkLabel(lblEnd); } } } diff --git a/ChocolArm64/Instructions/InstEmitSimdHelper.cs b/ChocolArm64/Instructions/InstEmitSimdHelper.cs index 7b597be3..cea481a6 100644 --- a/ChocolArm64/Instructions/InstEmitSimdHelper.cs +++ b/ChocolArm64/Instructions/InstEmitSimdHelper.cs @@ -322,26 +322,6 @@ namespace ChocolArm64.Instructions context.EmitCall(mthdInfo); } - public static void EmitUnarySoftFloatCall(ILEmitterCtx context, string name) - { - IOpCodeSimd64 op = (IOpCodeSimd64)context.CurrOp; - - int sizeF = op.Size & 1; - - MethodInfo mthdInfo; - - if (sizeF == 0) - { - mthdInfo = typeof(SoftFloat).GetMethod(name, new Type[] { typeof(float) }); - } - else /* if (sizeF == 1) */ - { - mthdInfo = typeof(SoftFloat).GetMethod(name, new Type[] { typeof(double) }); - } - - context.EmitCall(mthdInfo); - } - public static void EmitSoftFloatCall(ILEmitterCtx context, string name) { IOpCodeSimd64 op = (IOpCodeSimd64)context.CurrOp; @@ -909,6 +889,96 @@ namespace ChocolArm64.Instructions } } + public static void EmitVectorPairwiseSseOrSse2OpF(ILEmitterCtx context, string name) + { + OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; + + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + if (op.RegisterSize == RegisterSize.Simd64) + { + Type[] types = new Type[] { typeof(Vector128), typeof(Vector128) }; + + context.EmitLdvec(op.Rn); + context.EmitLdvec(op.Rm); + + context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.UnpackLow), types)); + + context.Emit(OpCodes.Dup); + context.EmitStvectmp(); + + VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero)); + + context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), types)); + + VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero)); + + context.EmitLdvectmp(); + + context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveHighToLow), types)); + + context.EmitCall(typeof(Sse).GetMethod(name, types)); + + context.EmitStvec(op.Rd); + } + else /* if (op.RegisterSize == RegisterSize.Simd128) */ + { + Type[] typesSfl = new Type[] { typeof(Vector128), typeof(Vector128), typeof(byte) }; + Type[] types = new Type[] { typeof(Vector128), typeof(Vector128) }; + + context.EmitLdvec(op.Rn); + + context.Emit(OpCodes.Dup); + context.EmitStvectmp(); + + context.EmitLdvec(op.Rm); + + context.Emit(OpCodes.Dup); + context.EmitStvectmp2(); + + context.EmitLdc_I4(2 << 6 | 0 << 4 | 2 << 2 | 0 << 0); + context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Shuffle), typesSfl)); + + context.EmitLdvectmp(); + context.EmitLdvectmp2(); + + context.EmitLdc_I4(3 << 6 | 1 << 4 | 3 << 2 | 1 << 0); + context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Shuffle), typesSfl)); + + context.EmitCall(typeof(Sse).GetMethod(name, types)); + + context.EmitStvec(op.Rd); + } + } + else /* if (sizeF == 1) */ + { + Type[] types = new Type[] { typeof(Vector128), typeof(Vector128) }; + + EmitLdvecWithCastToDouble(context, op.Rn); + + context.Emit(OpCodes.Dup); + context.EmitStvectmp(); + + EmitLdvecWithCastToDouble(context, op.Rm); + + context.Emit(OpCodes.Dup); + context.EmitStvectmp2(); + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.UnpackLow), types)); + + context.EmitLdvectmp(); + context.EmitLdvectmp2(); + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.UnpackHigh), types)); + + context.EmitCall(typeof(Sse2).GetMethod(name, types)); + + EmitStvecWithCastFromDouble(context, op.Rd); + } + } + [Flags] public enum SaturatingFlags { diff --git a/ChocolArm64/Instructions/InstEmitSimdMove.cs b/ChocolArm64/Instructions/InstEmitSimdMove.cs index 0d9aa312..d40ccff9 100644 --- a/ChocolArm64/Instructions/InstEmitSimdMove.cs +++ b/ChocolArm64/Instructions/InstEmitSimdMove.cs @@ -377,75 +377,47 @@ namespace ChocolArm64.Instructions { OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; - int elems = 8 >> op.Size; - - int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0; - - if (Optimizations.UseSse41 && op.Size < 2) + if (Optimizations.UseSsse3) { - void EmitZeroVector() + long[] masks = new long[] { - switch (op.Size) - { - case 0: VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInt16Zero)); break; - case 1: VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInt32Zero)); break; - } - } + 14L << 56 | 12L << 48 | 10L << 40 | 08L << 32 | 06L << 24 | 04L << 16 | 02L << 8 | 00L << 0, + 13L << 56 | 12L << 48 | 09L << 40 | 08L << 32 | 05L << 24 | 04L << 16 | 01L << 8 | 00L << 0, + 11L << 56 | 10L << 48 | 09L << 40 | 08L << 32 | 03L << 24 | 02L << 16 | 01L << 8 | 00L << 0 + }; - //For XTN, first operand is source, second operand is 0. - //For XTN2, first operand is 0, second operand is source. - if (part != 0) - { - EmitZeroVector(); - } + Type[] typesMov = new Type[] { typeof(Vector128), typeof(Vector128) }; + Type[] typesSfl = new Type[] { typeof(Vector128), typeof(Vector128) }; + Type[] typesSve = new Type[] { typeof(long), typeof(long) }; - EmitLdvecWithSignedCast(context, op.Rn, op.Size + 1); + string nameMov = op.RegisterSize == RegisterSize.Simd128 + ? nameof(Sse.MoveLowToHigh) + : nameof(Sse.MoveHighToLow); - //Set mask to discard the upper half of the wide elements. - switch (op.Size) - { - case 0: context.EmitLdc_I4(0x00ff); break; - case 1: context.EmitLdc_I4(0x0000ffff); break; - } + context.EmitLdvec(op.Rd); + VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero)); - Type wideType = IntTypesPerSizeLog2[op.Size + 1]; + context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), typesMov)); - context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), new Type[] { wideType })); + EmitLdvecWithSignedCast(context, op.Rn, 0); - wideType = VectorIntTypesPerSizeLog2[op.Size + 1]; + context.EmitLdc_I8(masks[op.Size]); + context.Emit(OpCodes.Dup); - Type[] wideTypes = new Type[] { wideType, wideType }; + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve)); - context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), wideTypes)); + context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), typesSfl)); - if (part == 0) - { - EmitZeroVector(); - } + context.EmitCall(typeof(Sse).GetMethod(nameMov, typesMov)); - //Pack values with signed saturation, the signed saturation shouldn't - //saturate anything since the upper bits were masked off. - Type sseType = op.Size == 0 ? typeof(Sse2) : typeof(Sse41); - - context.EmitCall(sseType.GetMethod(nameof(Sse2.PackUnsignedSaturate), wideTypes)); - - if (part != 0) - { - //For XTN2, we additionally need to discard the upper bits - //of the target register and OR the result with it. - EmitVectorZeroUpper(context, op.Rd); - - EmitLdvecWithUnsignedCast(context, op.Rd, op.Size); - - Type narrowType = VectorUIntTypesPerSizeLog2[op.Size]; - - context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Or), new Type[] { narrowType, narrowType })); - } - - EmitStvecWithUnsignedCast(context, op.Rd, op.Size); + context.EmitStvec(op.Rd); } else { + int elems = 8 >> op.Size; + + int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0; + if (part != 0) { context.EmitLdvec(op.Rd); diff --git a/ChocolArm64/Instructions/InstEmitSimdShift.cs b/ChocolArm64/Instructions/InstEmitSimdShift.cs index 5b606167..84305211 100644 --- a/ChocolArm64/Instructions/InstEmitSimdShift.cs +++ b/ChocolArm64/Instructions/InstEmitSimdShift.cs @@ -22,9 +22,11 @@ namespace ChocolArm64.Instructions { OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp; + int shift = GetImmShl(op); + EmitScalarUnaryOpZx(context, () => { - context.EmitLdc_I4(GetImmShl(op)); + context.EmitLdc_I4(shift); context.Emit(OpCodes.Shl); }); @@ -34,13 +36,15 @@ namespace ChocolArm64.Instructions { OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp; + int shift = GetImmShl(op); + if (Optimizations.UseSse2 && op.Size > 0) { Type[] typesSll = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) }; EmitLdvecWithUnsignedCast(context, op.Rn, op.Size); - context.EmitLdc_I4(GetImmShl(op)); + context.EmitLdc_I4(shift); context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesSll)); EmitStvecWithUnsignedCast(context, op.Rd, op.Size); @@ -54,7 +58,7 @@ namespace ChocolArm64.Instructions { EmitVectorUnaryOpZx(context, () => { - context.EmitLdc_I4(GetImmShl(op)); + context.EmitLdc_I4(shift); context.Emit(OpCodes.Shl); }); @@ -67,7 +71,33 @@ namespace ChocolArm64.Instructions int shift = 8 << op.Size; - EmitVectorShImmWidenBinaryZx(context, () => context.Emit(OpCodes.Shl), shift); + if (Optimizations.UseSse41) + { + Type[] typesSll = new Type[] { VectorUIntTypesPerSizeLog2[op.Size + 1], typeof(byte) }; + Type[] typesCvt = new Type[] { VectorUIntTypesPerSizeLog2[op.Size] }; + + string[] namesCvt = new string[] { nameof(Sse41.ConvertToVector128Int16), + nameof(Sse41.ConvertToVector128Int32), + nameof(Sse41.ConvertToVector128Int64) }; + + int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0; + + EmitLdvecWithUnsignedCast(context, op.Rn, op.Size); + + context.EmitLdc_I4(numBytes); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSll)); + + context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt)); + + context.EmitLdc_I4(shift); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesSll)); + + EmitStvecWithUnsignedCast(context, op.Rd, op.Size + 1); + } + else + { + EmitVectorShImmWidenBinaryZx(context, () => context.Emit(OpCodes.Shl), shift); + } } public static void Shrn_V(ILEmitterCtx context) @@ -362,7 +392,35 @@ namespace ChocolArm64.Instructions { OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp; - EmitVectorShImmWidenBinarySx(context, () => context.Emit(OpCodes.Shl), GetImmShl(op)); + int shift = GetImmShl(op); + + if (Optimizations.UseSse41) + { + Type[] typesSll = new Type[] { VectorIntTypesPerSizeLog2[op.Size + 1], typeof(byte) }; + Type[] typesCvt = new Type[] { VectorIntTypesPerSizeLog2[op.Size] }; + + string[] namesCvt = new string[] { nameof(Sse41.ConvertToVector128Int16), + nameof(Sse41.ConvertToVector128Int32), + nameof(Sse41.ConvertToVector128Int64) }; + + int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0; + + EmitLdvecWithSignedCast(context, op.Rn, op.Size); + + context.EmitLdc_I4(numBytes); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSll)); + + context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt)); + + context.EmitLdc_I4(shift); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesSll)); + + EmitStvecWithSignedCast(context, op.Rd, op.Size + 1); + } + else + { + EmitVectorShImmWidenBinarySx(context, () => context.Emit(OpCodes.Shl), shift); + } } public static void Sshr_S(ILEmitterCtx context) @@ -663,7 +721,35 @@ namespace ChocolArm64.Instructions { OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp; - EmitVectorShImmWidenBinaryZx(context, () => context.Emit(OpCodes.Shl), GetImmShl(op)); + int shift = GetImmShl(op); + + if (Optimizations.UseSse41) + { + Type[] typesSll = new Type[] { VectorUIntTypesPerSizeLog2[op.Size + 1], typeof(byte) }; + Type[] typesCvt = new Type[] { VectorUIntTypesPerSizeLog2[op.Size] }; + + string[] namesCvt = new string[] { nameof(Sse41.ConvertToVector128Int16), + nameof(Sse41.ConvertToVector128Int32), + nameof(Sse41.ConvertToVector128Int64) }; + + int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0; + + EmitLdvecWithUnsignedCast(context, op.Rn, op.Size); + + context.EmitLdc_I4(numBytes); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSll)); + + context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt)); + + context.EmitLdc_I4(shift); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesSll)); + + EmitStvecWithUnsignedCast(context, op.Rd, op.Size + 1); + } + else + { + EmitVectorShImmWidenBinaryZx(context, () => context.Emit(OpCodes.Shl), shift); + } } public static void Ushr_S(ILEmitterCtx context) diff --git a/ChocolArm64/Instructions/SoftFloat.cs b/ChocolArm64/Instructions/SoftFloat.cs index 2af8afbd..39d279de 100644 --- a/ChocolArm64/Instructions/SoftFloat.cs +++ b/ChocolArm64/Instructions/SoftFloat.cs @@ -9,191 +9,72 @@ namespace ChocolArm64.Instructions { static SoftFloat() { - RecipEstimateTable = BuildRecipEstimateTable(); - InvSqrtEstimateTable = BuildInvSqrtEstimateTable(); + RecipEstimateTable = BuildRecipEstimateTable(); + RecipSqrtEstimateTable = BuildRecipSqrtEstimateTable(); } - private static readonly byte[] RecipEstimateTable; - private static readonly byte[] InvSqrtEstimateTable; + internal static readonly byte[] RecipEstimateTable; + internal static readonly byte[] RecipSqrtEstimateTable; private static byte[] BuildRecipEstimateTable() { - byte[] table = new byte[256]; - for (ulong index = 0; index < 256; index++) + byte[] tbl = new byte[256]; + + for (int idx = 0; idx < 256; idx++) { - ulong a = index | 0x100; + uint src = (uint)idx + 256u; - a = (a << 1) + 1; - ulong b = 0x80000 / a; - b = (b + 1) >> 1; + Debug.Assert(256u <= src && src < 512u); - table[index] = (byte)(b & 0xFF); + src = (src << 1) + 1u; + + uint aux = (1u << 19) / src; + + uint dst = (aux + 1u) >> 1; + + Debug.Assert(256u <= dst && dst < 512u); + + tbl[idx] = (byte)(dst - 256u); } - return table; + + return tbl; } - private static byte[] BuildInvSqrtEstimateTable() + private static byte[] BuildRecipSqrtEstimateTable() { - byte[] table = new byte[512]; - for (ulong index = 128; index < 512; index++) + byte[] tbl = new byte[384]; + + for (int idx = 0; idx < 384; idx++) { - ulong a = index; - if (a < 256) + uint src = (uint)idx + 128u; + + Debug.Assert(128u <= src && src < 512u); + + if (src < 256u) { - a = (a << 1) + 1; + src = (src << 1) + 1u; } else { - a = (a | 1) << 1; + src = (src >> 1) << 1; + src = (src + 1u) << 1; } - ulong b = 256; - while (a * (b + 1) * (b + 1) < (1ul << 28)) + uint aux = 512u; + + while (src * (aux + 1u) * (aux + 1u) < (1u << 28)) { - b++; - } - b = (b + 1) >> 1; - - table[index] = (byte)(b & 0xFF); - } - return table; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static float RecipEstimate(float x) - { - return (float)RecipEstimate((double)x); - } - - public static double RecipEstimate(double x) - { - ulong xBits = (ulong)BitConverter.DoubleToInt64Bits(x); - ulong xSign = xBits & 0x8000000000000000; - ulong xExp = (xBits >> 52) & 0x7FF; - ulong scaled = xBits & ((1ul << 52) - 1); - - if (xExp >= 2045) - { - if (xExp == 0x7ff && scaled != 0) - { - // NaN - return BitConverter.Int64BitsToDouble((long)(xBits | 0x0008000000000000)); + aux = aux + 1u; } - // Infinity, or Out of range -> Zero - return BitConverter.Int64BitsToDouble((long)xSign); + uint dst = (aux + 1u) >> 1; + + Debug.Assert(256u <= dst && dst < 512u); + + tbl[idx] = (byte)(dst - 256u); } - if (xExp == 0) - { - if (scaled == 0) - { - // Zero -> Infinity - return BitConverter.Int64BitsToDouble((long)(xSign | 0x7FF0000000000000)); - } - - // Denormal - if ((scaled & (1ul << 51)) == 0) - { - xExp = ~0ul; - scaled <<= 2; - } - else - { - scaled <<= 1; - } - } - - scaled >>= 44; - scaled &= 0xFF; - - ulong resultExp = (2045 - xExp) & 0x7FF; - ulong estimate = (ulong)RecipEstimateTable[scaled]; - ulong fraction = estimate << 44; - - if (resultExp == 0) - { - fraction >>= 1; - fraction |= 1ul << 51; - } - else if (resultExp == 0x7FF) - { - resultExp = 0; - fraction >>= 2; - fraction |= 1ul << 50; - } - - ulong result = xSign | (resultExp << 52) | fraction; - return BitConverter.Int64BitsToDouble((long)result); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static float InvSqrtEstimate(float x) - { - return (float)InvSqrtEstimate((double)x); - } - - public static double InvSqrtEstimate(double x) - { - ulong xBits = (ulong)BitConverter.DoubleToInt64Bits(x); - ulong xSign = xBits & 0x8000000000000000; - long xExp = (long)((xBits >> 52) & 0x7FF); - ulong scaled = xBits & ((1ul << 52) - 1); - - if (xExp == 0x7FF && scaled != 0) - { - // NaN - return BitConverter.Int64BitsToDouble((long)(xBits | 0x0008000000000000)); - } - - if (xExp == 0) - { - if (scaled == 0) - { - // Zero -> Infinity - return BitConverter.Int64BitsToDouble((long)(xSign | 0x7FF0000000000000)); - } - - // Denormal - while ((scaled & (1 << 51)) == 0) - { - scaled <<= 1; - xExp--; - } - scaled <<= 1; - } - - if (xSign != 0) - { - // Negative -> NaN - return BitConverter.Int64BitsToDouble((long)0x7FF8000000000000); - } - - if (xExp == 0x7ff && scaled == 0) - { - // Infinity -> Zero - return BitConverter.Int64BitsToDouble((long)xSign); - } - - if (((ulong)xExp & 1) == 1) - { - scaled >>= 45; - scaled &= 0xFF; - scaled |= 0x80; - } - else - { - scaled >>= 44; - scaled &= 0xFF; - scaled |= 0x100; - } - - ulong resultExp = ((ulong)(3068 - xExp) / 2) & 0x7FF; - ulong estimate = (ulong)InvSqrtEstimateTable[scaled]; - ulong fraction = estimate << 44; - - ulong result = xSign | (resultExp << 52) | fraction; - return BitConverter.Int64BitsToDouble((long)result); + return tbl; } } @@ -395,12 +276,12 @@ namespace ChocolArm64.Instructions { intMant++; - if (intMant == (uint)Math.Pow(2d, f)) + if (intMant == 1u << f) { biasedExp = 1u; } - if (intMant == (uint)Math.Pow(2d, f + 1)) + if (intMant == 1u << (f + 1)) { biasedExp++; intMant >>= 1; @@ -409,7 +290,7 @@ namespace ChocolArm64.Instructions float result; - if (biasedExp >= (uint)Math.Pow(2d, e) - 1u) + if (biasedExp >= (1u << e) - 1u) { result = overflowToInf ? FPInfinity(sign) : FPMaxNormal(sign); @@ -666,12 +547,12 @@ namespace ChocolArm64.Instructions { intMant++; - if (intMant == (uint)Math.Pow(2d, f)) + if (intMant == 1u << f) { biasedExp = 1u; } - if (intMant == (uint)Math.Pow(2d, f + 1)) + if (intMant == 1u << (f + 1)) { biasedExp++; intMant >>= 1; @@ -682,7 +563,7 @@ namespace ChocolArm64.Instructions if (!state.GetFpcrFlag(Fpcr.Ahp)) { - if (biasedExp >= (uint)Math.Pow(2d, e) - 1u) + if (biasedExp >= (1u << e) - 1u) { resultBits = overflowToInf ? FPInfinity(sign) : FPMaxNormal(sign); @@ -697,7 +578,7 @@ namespace ChocolArm64.Instructions } else { - if (biasedExp >= (uint)Math.Pow(2d, e)) + if (biasedExp >= 1u << e) { resultBits = (ushort)((sign ? 1u : 0u) << 15 | 0x7FFFu); @@ -826,6 +707,94 @@ namespace ChocolArm64.Instructions return result; } + public static float FPCompareEQ(float value1, float value2, CpuThreadState state) + { + Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat32.FPCompareEQ: state.Fpcr = 0x{state.Fpcr:X8}"); + + value1 = value1.FPUnpack(out FpType type1, out _, out _, state); + value2 = value2.FPUnpack(out FpType type2, out _, out _, state); + + float result; + + if (type1 == FpType.SNaN || type1 == FpType.QNaN || type2 == FpType.SNaN || type2 == FpType.QNaN) + { + result = ZerosOrOnes(false); + + if (type1 == FpType.SNaN || type2 == FpType.SNaN) + { + FPProcessException(FpExc.InvalidOp, state); + } + } + else + { + result = ZerosOrOnes(value1 == value2); + } + + return result; + } + + public static float FPCompareGE(float value1, float value2, CpuThreadState state) + { + Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat32.FPCompareGE: state.Fpcr = 0x{state.Fpcr:X8}"); + + value1 = value1.FPUnpack(out FpType type1, out _, out _, state); + value2 = value2.FPUnpack(out FpType type2, out _, out _, state); + + float result; + + if (type1 == FpType.SNaN || type1 == FpType.QNaN || type2 == FpType.SNaN || type2 == FpType.QNaN) + { + result = ZerosOrOnes(false); + + FPProcessException(FpExc.InvalidOp, state); + } + else + { + result = ZerosOrOnes(value1 >= value2); + } + + return result; + } + + public static float FPCompareGT(float value1, float value2, CpuThreadState state) + { + Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat32.FPCompareGT: state.Fpcr = 0x{state.Fpcr:X8}"); + + value1 = value1.FPUnpack(out FpType type1, out _, out _, state); + value2 = value2.FPUnpack(out FpType type2, out _, out _, state); + + float result; + + if (type1 == FpType.SNaN || type1 == FpType.QNaN || type2 == FpType.SNaN || type2 == FpType.QNaN) + { + result = ZerosOrOnes(false); + + FPProcessException(FpExc.InvalidOp, state); + } + else + { + result = ZerosOrOnes(value1 > value2); + } + + return result; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static float FPCompareLE(float value1, float value2, CpuThreadState state) + { + Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat32.FPCompareLE: state.Fpcr = 0x{state.Fpcr:X8}"); + + return FPCompareGE(value2, value1, state); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static float FPCompareLT(float value1, float value2, CpuThreadState state) + { + Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat32.FPCompareLT: state.Fpcr = 0x{state.Fpcr:X8}"); + + return FPCompareGT(value2, value1, state); + } + public static float FPDiv(float value1, float value2, CpuThreadState state) { Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat32.FPDiv: state.Fpcr = 0x{state.Fpcr:X8}"); @@ -1188,6 +1157,95 @@ namespace ChocolArm64.Instructions return result; } + public static float FPRecipEstimate(float value, CpuThreadState state) + { + Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat32.FPRecipEstimate: state.Fpcr = 0x{state.Fpcr:X8}"); + + value.FPUnpack(out FpType type, out bool sign, out uint op, state); + + float result; + + if (type == FpType.SNaN || type == FpType.QNaN) + { + result = FPProcessNaN(type, op, state); + } + else if (type == FpType.Infinity) + { + result = FPZero(sign); + } + else if (type == FpType.Zero) + { + result = FPInfinity(sign); + + FPProcessException(FpExc.DivideByZero, state); + } + else if (MathF.Abs(value) < MathF.Pow(2f, -128)) + { + bool overflowToInf; + + switch (state.FPRoundingMode()) + { + default: + case RoundMode.ToNearest: overflowToInf = true; break; + case RoundMode.TowardsPlusInfinity: overflowToInf = !sign; break; + case RoundMode.TowardsMinusInfinity: overflowToInf = sign; break; + case RoundMode.TowardsZero: overflowToInf = false; break; + } + + result = overflowToInf ? FPInfinity(sign) : FPMaxNormal(sign); + + FPProcessException(FpExc.Overflow, state); + FPProcessException(FpExc.Inexact, state); + } + else if (state.GetFpcrFlag(Fpcr.Fz) && (MathF.Abs(value) >= MathF.Pow(2f, 126))) + { + result = FPZero(sign); + + state.SetFpsrFlag(Fpsr.Ufc); + } + else + { + ulong fraction = (ulong)(op & 0x007FFFFFu) << 29; + uint exp = (op & 0x7F800000u) >> 23; + + if (exp == 0u) + { + if ((fraction & 0x0008000000000000ul) == 0ul) + { + fraction = (fraction & 0x0003FFFFFFFFFFFFul) << 2; + exp -= 1u; + } + else + { + fraction = (fraction & 0x0007FFFFFFFFFFFFul) << 1; + } + } + + uint scaled = (uint)(((fraction & 0x000FF00000000000ul) | 0x0010000000000000ul) >> 44); + + uint resultExp = 253u - exp; + + uint estimate = (uint)SoftFloat.RecipEstimateTable[scaled - 256u] + 256u; + + fraction = (ulong)(estimate & 0xFFu) << 44; + + if (resultExp == 0u) + { + fraction = ((fraction & 0x000FFFFFFFFFFFFEul) | 0x0010000000000000ul) >> 1; + } + else if (resultExp + 1u == 0u) + { + fraction = ((fraction & 0x000FFFFFFFFFFFFCul) | 0x0010000000000000ul) >> 2; + resultExp = 0u; + } + + result = BitConverter.Int32BitsToSingle( + (int)((sign ? 1u : 0u) << 31 | (resultExp & 0xFFu) << 23 | (uint)(fraction >> 29) & 0x007FFFFFu)); + } + + return result; + } + public static float FPRecipStepFused(float value1, float value2, CpuThreadState state) { Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat32.FPRecipStepFused: state.Fpcr = 0x{state.Fpcr:X8}"); @@ -1255,6 +1313,71 @@ namespace ChocolArm64.Instructions return result; } + public static float FPRSqrtEstimate(float value, CpuThreadState state) + { + Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat32.FPRSqrtEstimate: state.Fpcr = 0x{state.Fpcr:X8}"); + + value.FPUnpack(out FpType type, out bool sign, out uint op, state); + + float result; + + if (type == FpType.SNaN || type == FpType.QNaN) + { + result = FPProcessNaN(type, op, state); + } + else if (type == FpType.Zero) + { + result = FPInfinity(sign); + + FPProcessException(FpExc.DivideByZero, state); + } + else if (sign) + { + result = FPDefaultNaN(); + + FPProcessException(FpExc.InvalidOp, state); + } + else if (type == FpType.Infinity) + { + result = FPZero(false); + } + else + { + ulong fraction = (ulong)(op & 0x007FFFFFu) << 29; + uint exp = (op & 0x7F800000u) >> 23; + + if (exp == 0u) + { + while ((fraction & 0x0008000000000000ul) == 0ul) + { + fraction = (fraction & 0x0007FFFFFFFFFFFFul) << 1; + exp -= 1u; + } + + fraction = (fraction & 0x0007FFFFFFFFFFFFul) << 1; + } + + uint scaled; + + if ((exp & 1u) == 0u) + { + scaled = (uint)(((fraction & 0x000FF00000000000ul) | 0x0010000000000000ul) >> 44); + } + else + { + scaled = (uint)(((fraction & 0x000FE00000000000ul) | 0x0010000000000000ul) >> 45); + } + + uint resultExp = (380u - exp) >> 1; + + uint estimate = (uint)SoftFloat.RecipSqrtEstimateTable[scaled - 128u] + 256u; + + result = BitConverter.Int32BitsToSingle((int)((resultExp & 0xFFu) << 23 | (estimate & 0xFFu) << 15)); + } + + return result; + } + public static float FPRSqrtStepFused(float value1, float value2, CpuThreadState state) { Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat32.FPRSqrtStepFused: state.Fpcr = 0x{state.Fpcr:X8}"); @@ -1402,6 +1525,11 @@ namespace ChocolArm64.Instructions return sign ? -0f : +0f; } + private static float FPMaxNormal(bool sign) + { + return sign ? float.MinValue : float.MaxValue; + } + private static float FPTwo(bool sign) { return sign ? -2f : +2f; @@ -1417,6 +1545,11 @@ namespace ChocolArm64.Instructions return -value; } + private static float ZerosOrOnes(bool zeros) + { + return BitConverter.Int32BitsToSingle(!zeros ? 0 : -1); + } + private static float FPUnpack( this float value, out FpType type, @@ -1658,6 +1791,94 @@ namespace ChocolArm64.Instructions return result; } + public static double FPCompareEQ(double value1, double value2, CpuThreadState state) + { + Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat64.FPCompareEQ: state.Fpcr = 0x{state.Fpcr:X8}"); + + value1 = value1.FPUnpack(out FpType type1, out _, out _, state); + value2 = value2.FPUnpack(out FpType type2, out _, out _, state); + + double result; + + if (type1 == FpType.SNaN || type1 == FpType.QNaN || type2 == FpType.SNaN || type2 == FpType.QNaN) + { + result = ZerosOrOnes(false); + + if (type1 == FpType.SNaN || type2 == FpType.SNaN) + { + FPProcessException(FpExc.InvalidOp, state); + } + } + else + { + result = ZerosOrOnes(value1 == value2); + } + + return result; + } + + public static double FPCompareGE(double value1, double value2, CpuThreadState state) + { + Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat64.FPCompareGE: state.Fpcr = 0x{state.Fpcr:X8}"); + + value1 = value1.FPUnpack(out FpType type1, out _, out _, state); + value2 = value2.FPUnpack(out FpType type2, out _, out _, state); + + double result; + + if (type1 == FpType.SNaN || type1 == FpType.QNaN || type2 == FpType.SNaN || type2 == FpType.QNaN) + { + result = ZerosOrOnes(false); + + FPProcessException(FpExc.InvalidOp, state); + } + else + { + result = ZerosOrOnes(value1 >= value2); + } + + return result; + } + + public static double FPCompareGT(double value1, double value2, CpuThreadState state) + { + Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat64.FPCompareGT: state.Fpcr = 0x{state.Fpcr:X8}"); + + value1 = value1.FPUnpack(out FpType type1, out _, out _, state); + value2 = value2.FPUnpack(out FpType type2, out _, out _, state); + + double result; + + if (type1 == FpType.SNaN || type1 == FpType.QNaN || type2 == FpType.SNaN || type2 == FpType.QNaN) + { + result = ZerosOrOnes(false); + + FPProcessException(FpExc.InvalidOp, state); + } + else + { + result = ZerosOrOnes(value1 > value2); + } + + return result; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static double FPCompareLE(double value1, double value2, CpuThreadState state) + { + Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat64.FPCompareLE: state.Fpcr = 0x{state.Fpcr:X8}"); + + return FPCompareGE(value2, value1, state); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static double FPCompareLT(double value1, double value2, CpuThreadState state) + { + Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat64.FPCompareLT: state.Fpcr = 0x{state.Fpcr:X8}"); + + return FPCompareGT(value2, value1, state); + } + public static double FPDiv(double value1, double value2, CpuThreadState state) { Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat64.FPDiv: state.Fpcr = 0x{state.Fpcr:X8}"); @@ -2020,6 +2241,95 @@ namespace ChocolArm64.Instructions return result; } + public static double FPRecipEstimate(double value, CpuThreadState state) + { + Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat64.FPRecipEstimate: state.Fpcr = 0x{state.Fpcr:X8}"); + + value.FPUnpack(out FpType type, out bool sign, out ulong op, state); + + double result; + + if (type == FpType.SNaN || type == FpType.QNaN) + { + result = FPProcessNaN(type, op, state); + } + else if (type == FpType.Infinity) + { + result = FPZero(sign); + } + else if (type == FpType.Zero) + { + result = FPInfinity(sign); + + FPProcessException(FpExc.DivideByZero, state); + } + else if (Math.Abs(value) < Math.Pow(2d, -1024)) + { + bool overflowToInf; + + switch (state.FPRoundingMode()) + { + default: + case RoundMode.ToNearest: overflowToInf = true; break; + case RoundMode.TowardsPlusInfinity: overflowToInf = !sign; break; + case RoundMode.TowardsMinusInfinity: overflowToInf = sign; break; + case RoundMode.TowardsZero: overflowToInf = false; break; + } + + result = overflowToInf ? FPInfinity(sign) : FPMaxNormal(sign); + + FPProcessException(FpExc.Overflow, state); + FPProcessException(FpExc.Inexact, state); + } + else if (state.GetFpcrFlag(Fpcr.Fz) && (Math.Abs(value) >= Math.Pow(2d, 1022))) + { + result = FPZero(sign); + + state.SetFpsrFlag(Fpsr.Ufc); + } + else + { + ulong fraction = op & 0x000FFFFFFFFFFFFFul; + uint exp = (uint)((op & 0x7FF0000000000000ul) >> 52); + + if (exp == 0u) + { + if ((fraction & 0x0008000000000000ul) == 0ul) + { + fraction = (fraction & 0x0003FFFFFFFFFFFFul) << 2; + exp -= 1u; + } + else + { + fraction = (fraction & 0x0007FFFFFFFFFFFFul) << 1; + } + } + + uint scaled = (uint)(((fraction & 0x000FF00000000000ul) | 0x0010000000000000ul) >> 44); + + uint resultExp = 2045u - exp; + + uint estimate = (uint)SoftFloat.RecipEstimateTable[scaled - 256u] + 256u; + + fraction = (ulong)(estimate & 0xFFu) << 44; + + if (resultExp == 0u) + { + fraction = ((fraction & 0x000FFFFFFFFFFFFEul) | 0x0010000000000000ul) >> 1; + } + else if (resultExp + 1u == 0u) + { + fraction = ((fraction & 0x000FFFFFFFFFFFFCul) | 0x0010000000000000ul) >> 2; + resultExp = 0u; + } + + result = BitConverter.Int64BitsToDouble( + (long)((sign ? 1ul : 0ul) << 63 | (resultExp & 0x7FFul) << 52 | (fraction & 0x000FFFFFFFFFFFFFul))); + } + + return result; + } + public static double FPRecipStepFused(double value1, double value2, CpuThreadState state) { Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat64.FPRecipStepFused: state.Fpcr = 0x{state.Fpcr:X8}"); @@ -2087,6 +2397,71 @@ namespace ChocolArm64.Instructions return result; } + public static double FPRSqrtEstimate(double value, CpuThreadState state) + { + Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat64.FPRSqrtEstimate: state.Fpcr = 0x{state.Fpcr:X8}"); + + value.FPUnpack(out FpType type, out bool sign, out ulong op, state); + + double result; + + if (type == FpType.SNaN || type == FpType.QNaN) + { + result = FPProcessNaN(type, op, state); + } + else if (type == FpType.Zero) + { + result = FPInfinity(sign); + + FPProcessException(FpExc.DivideByZero, state); + } + else if (sign) + { + result = FPDefaultNaN(); + + FPProcessException(FpExc.InvalidOp, state); + } + else if (type == FpType.Infinity) + { + result = FPZero(false); + } + else + { + ulong fraction = op & 0x000FFFFFFFFFFFFFul; + uint exp = (uint)((op & 0x7FF0000000000000ul) >> 52); + + if (exp == 0u) + { + while ((fraction & 0x0008000000000000ul) == 0ul) + { + fraction = (fraction & 0x0007FFFFFFFFFFFFul) << 1; + exp -= 1u; + } + + fraction = (fraction & 0x0007FFFFFFFFFFFFul) << 1; + } + + uint scaled; + + if ((exp & 1u) == 0u) + { + scaled = (uint)(((fraction & 0x000FF00000000000ul) | 0x0010000000000000ul) >> 44); + } + else + { + scaled = (uint)(((fraction & 0x000FE00000000000ul) | 0x0010000000000000ul) >> 45); + } + + uint resultExp = (3068u - exp) >> 1; + + uint estimate = (uint)SoftFloat.RecipSqrtEstimateTable[scaled - 128u] + 256u; + + result = BitConverter.Int64BitsToDouble((long)((resultExp & 0x7FFul) << 52 | (estimate & 0xFFul) << 44)); + } + + return result; + } + public static double FPRSqrtStepFused(double value1, double value2, CpuThreadState state) { Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat64.FPRSqrtStepFused: state.Fpcr = 0x{state.Fpcr:X8}"); @@ -2234,6 +2609,11 @@ namespace ChocolArm64.Instructions return sign ? -0d : +0d; } + private static double FPMaxNormal(bool sign) + { + return sign ? double.MinValue : double.MaxValue; + } + private static double FPTwo(bool sign) { return sign ? -2d : +2d; @@ -2249,6 +2629,11 @@ namespace ChocolArm64.Instructions return -value; } + private static double ZerosOrOnes(bool zeros) + { + return BitConverter.Int64BitsToDouble(!zeros ? 0L : -1L); + } + private static double FPUnpack( this double value, out FpType type, diff --git a/ChocolArm64/OpCodeTable.cs b/ChocolArm64/OpCodeTable.cs index 845a48d0..adb71ae7 100644 --- a/ChocolArm64/OpCodeTable.cs +++ b/ChocolArm64/OpCodeTable.cs @@ -222,6 +222,7 @@ namespace ChocolArm64 SetA64("0x101110001xxxxx000111xxxxxxxxxx", InstEmit.Eor_V, typeof(OpCodeSimdReg64)); SetA64("0>101110000xxxxx01011101<1xxxxx110101xxxxxxxxxx", InstEmit.Fabd_V, typeof(OpCodeSimdReg64)); SetA64("000111100x100000110000xxxxxxxxxx", InstEmit.Fabs_S, typeof(OpCodeSimd64)); SetA64("0>0011101<100000111110xxxxxxxxxx", InstEmit.Fabs_V, typeof(OpCodeSimd64)); SetA64("000111100x1xxxxx001010xxxxxxxxxx", InstEmit.Fadd_S, typeof(OpCodeSimdReg64)); diff --git a/ChocolArm64/Optimizations.cs b/ChocolArm64/Optimizations.cs index aab5eca7..8fa6f462 100644 --- a/ChocolArm64/Optimizations.cs +++ b/ChocolArm64/Optimizations.cs @@ -8,12 +8,14 @@ public static class Optimizations private static bool _useSseIfAvailable = true; private static bool _useSse2IfAvailable = true; + private static bool _useSse3IfAvailable = true; private static bool _useSsse3IfAvailable = true; private static bool _useSse41IfAvailable = true; private static bool _useSse42IfAvailable = true; internal static bool UseSse = (_useAllSseIfAvailable && _useSseIfAvailable) && Sse.IsSupported; internal static bool UseSse2 = (_useAllSseIfAvailable && _useSse2IfAvailable) && Sse2.IsSupported; + internal static bool UseSse3 = (_useAllSseIfAvailable && _useSse3IfAvailable) && Sse3.IsSupported; internal static bool UseSsse3 = (_useAllSseIfAvailable && _useSsse3IfAvailable) && Ssse3.IsSupported; internal static bool UseSse41 = (_useAllSseIfAvailable && _useSse41IfAvailable) && Sse41.IsSupported; internal static bool UseSse42 = (_useAllSseIfAvailable && _useSse42IfAvailable) && Sse42.IsSupported; diff --git a/Ryujinx.Tests/Cpu/CpuTestSimd.cs b/Ryujinx.Tests/Cpu/CpuTestSimd.cs index 54889eee..565b6613 100644 --- a/Ryujinx.Tests/Cpu/CpuTestSimd.cs +++ b/Ryujinx.Tests/Cpu/CpuTestSimd.cs @@ -256,6 +256,112 @@ namespace Ryujinx.Tests.Cpu #endregion #region "ValueSource (Opcodes)" + private static uint[] _F_Abs_Neg_Recpx_Sqrt_S_S_() + { + return new uint[] + { + 0x1E20C020u, // FABS S0, S1 + 0x1E214020u, // FNEG S0, S1 + 0x5EA1F820u, // FRECPX S0, S1 + 0x1E21C020u // FSQRT S0, S1 + }; + } + + private static uint[] _F_Abs_Neg_Recpx_Sqrt_S_D_() + { + return new uint[] + { + 0x1E60C020u, // FABS D0, D1 + 0x1E614020u, // FNEG D0, D1 + 0x5EE1F820u, // FRECPX D0, D1 + 0x1E61C020u // FSQRT D0, D1 + }; + } + + private static uint[] _F_Abs_Neg_Sqrt_V_2S_4S_() + { + return new uint[] + { + 0x0EA0F800u, // FABS V0.2S, V0.2S + 0x2EA0F800u, // FNEG V0.2S, V0.2S + 0x2EA1F800u // FSQRT V0.2S, V0.2S + }; + } + + private static uint[] _F_Abs_Neg_Sqrt_V_2D_() + { + return new uint[] + { + 0x4EE0F800u, // FABS V0.2D, V0.2D + 0x6EE0F800u, // FNEG V0.2D, V0.2D + 0x6EE1F800u // FSQRT V0.2D, V0.2D + }; + } + + private static uint[] _F_Add_P_S_2SS_() + { + return new uint[] + { + 0x7E30D820u // FADDP S0, V1.2S + }; + } + + private static uint[] _F_Add_P_S_2DD_() + { + return new uint[] + { + 0x7E70D820u // FADDP D0, V1.2D + }; + } + + private static uint[] _F_Cm_EqGeGtLeLt_S_S_() + { + return new uint[] + { + 0x5EA0D820u, // FCMEQ S0, S1, #0.0 + 0x7EA0C820u, // FCMGE S0, S1, #0.0 + 0x5EA0C820u, // FCMGT S0, S1, #0.0 + 0x7EA0D820u, // FCMLE S0, S1, #0.0 + 0x5EA0E820u // FCMLT S0, S1, #0.0 + }; + } + + private static uint[] _F_Cm_EqGeGtLeLt_S_D_() + { + return new uint[] + { + 0x5EE0D820u, // FCMEQ D0, D1, #0.0 + 0x7EE0C820u, // FCMGE D0, D1, #0.0 + 0x5EE0C820u, // FCMGT D0, D1, #0.0 + 0x7EE0D820u, // FCMLE D0, D1, #0.0 + 0x5EE0E820u // FCMLT D0, D1, #0.0 + }; + } + + private static uint[] _F_Cm_EqGeGtLeLt_V_2S_4S_() + { + return new uint[] + { + 0x0EA0D800u, // FCMEQ V0.2S, V0.2S, #0.0 + 0x2EA0C800u, // FCMGE V0.2S, V0.2S, #0.0 + 0x0EA0C800u, // FCMGT V0.2S, V0.2S, #0.0 + 0x2EA0D800u, // FCMLE V0.2S, V0.2S, #0.0 + 0x0EA0E800u // FCMLT V0.2S, V0.2S, #0.0 + }; + } + + private static uint[] _F_Cm_EqGeGtLeLt_V_2D_() + { + return new uint[] + { + 0x4EE0D800u, // FCMEQ V0.2D, V0.2D, #0.0 + 0x6EE0C800u, // FCMGE V0.2D, V0.2D, #0.0 + 0x4EE0C800u, // FCMGT V0.2D, V0.2D, #0.0 + 0x6EE0D800u, // FCMLE V0.2D, V0.2D, #0.0 + 0x4EE0E800u // FCMLT V0.2D, V0.2D, #0.0 + }; + } + private static uint[] _F_Cmp_Cmpe_S_S_() { return new uint[] @@ -366,45 +472,39 @@ namespace Ryujinx.Tests.Cpu }; } - private static uint[] _F_Abs_Neg_Recpx_Sqrt_S_S_() + private static uint[] _F_Recpe_Rsqrte_S_S_() { return new uint[] { - 0x1E20C020u, // FABS S0, S1 - 0x1E214020u, // FNEG S0, S1 - 0x5EA1F820u, // FRECPX S0, S1 - 0x1E21C020u // FSQRT S0, S1 + 0x5EA1D820u, // FRECPE S0, S1 + 0x7EA1D820u // FRSQRTE S0, S1 }; } - private static uint[] _F_Abs_Neg_Recpx_Sqrt_S_D_() + private static uint[] _F_Recpe_Rsqrte_S_D_() { return new uint[] { - 0x1E60C020u, // FABS D0, D1 - 0x1E614020u, // FNEG D0, D1 - 0x5EE1F820u, // FRECPX D0, D1 - 0x1E61C020u // FSQRT D0, D1 + 0x5EE1D820u, // FRECPE D0, D1 + 0x7EE1D820u // FRSQRTE D0, D1 }; } - private static uint[] _F_Abs_Neg_Sqrt_V_2S_4S_() + private static uint[] _F_Recpe_Rsqrte_V_2S_4S_() { return new uint[] { - 0x0EA0F800u, // FABS V0.2S, V0.2S - 0x2EA0F800u, // FNEG V0.2S, V0.2S - 0x2EA1F800u // FSQRT V0.2S, V0.2S + 0x0EA1D800u, // FRECPE V0.2S, V0.2S + 0x2EA1D800u // FRSQRTE V0.2S, V0.2S }; } - private static uint[] _F_Abs_Neg_Sqrt_V_2D_() + private static uint[] _F_Recpe_Rsqrte_V_2D_() { return new uint[] { - 0x4EE0F800u, // FABS V0.2D, V0.2D - 0x6EE0F800u, // FNEG V0.2D, V0.2D - 0x6EE1F800u // FSQRT V0.2D, V0.2D + 0x4EE1D800u, // FRECPE V0.2D, V0.2D + 0x6EE1D800u // FRSQRTE V0.2D, V0.2D }; } @@ -963,6 +1063,202 @@ namespace Ryujinx.Tests.Cpu CompareAgainstUnicorn(); } + [Test, Pairwise] [Explicit] + public void F_Abs_Neg_Recpx_Sqrt_S_S([ValueSource("_F_Abs_Neg_Recpx_Sqrt_S_S_")] uint opcodes, + [ValueSource("_1S_F_")] ulong a) + { + ulong z = TestContext.CurrentContext.Random.NextULong(); + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0(a); + + int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); + + int fpcr = rnd & (1 << (int)Fpcr.Fz); + fpcr |= rnd & (1 << (int)Fpcr.Dn); + + SingleOpcode(opcodes, v0: v0, v1: v1, fpcr: fpcr); + + CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Idc); + } + + [Test, Pairwise] [Explicit] + public void F_Abs_Neg_Recpx_Sqrt_S_D([ValueSource("_F_Abs_Neg_Recpx_Sqrt_S_D_")] uint opcodes, + [ValueSource("_1D_F_")] ulong a) + { + ulong z = TestContext.CurrentContext.Random.NextULong(); + Vector128 v0 = MakeVectorE1(z); + Vector128 v1 = MakeVectorE0(a); + + int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); + + int fpcr = rnd & (1 << (int)Fpcr.Fz); + fpcr |= rnd & (1 << (int)Fpcr.Dn); + + SingleOpcode(opcodes, v0: v0, v1: v1, fpcr: fpcr); + + CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Idc); + } + + [Test, Pairwise] [Explicit] + public void F_Abs_Neg_Sqrt_V_2S_4S([ValueSource("_F_Abs_Neg_Sqrt_V_2S_4S_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [ValueSource("_2S_F_")] ulong z, + [ValueSource("_2S_F_")] ulong a, + [Values(0b0u, 0b1u)] uint q) // <2S, 4S> + { + opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); + opcodes |= ((q & 1) << 30); + + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0E1(a, a * q); + + int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); + + int fpcr = rnd & (1 << (int)Fpcr.Fz); + fpcr |= rnd & (1 << (int)Fpcr.Dn); + + SingleOpcode(opcodes, v0: v0, v1: v1, fpcr: fpcr); + + CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Idc); + } + + [Test, Pairwise] [Explicit] + public void F_Abs_Neg_Sqrt_V_2D([ValueSource("_F_Abs_Neg_Sqrt_V_2D_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [ValueSource("_1D_F_")] ulong z, + [ValueSource("_1D_F_")] ulong a) + { + opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); + + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0E1(a, a); + + int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); + + int fpcr = rnd & (1 << (int)Fpcr.Fz); + fpcr |= rnd & (1 << (int)Fpcr.Dn); + + SingleOpcode(opcodes, v0: v0, v1: v1, fpcr: fpcr); + + CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Idc); + } + + [Test, Pairwise] [Explicit] + public void F_Add_P_S_2SS([ValueSource("_F_Add_P_S_2SS_")] uint opcodes, + [ValueSource("_2S_F_")] ulong a) + { + ulong z = TestContext.CurrentContext.Random.NextULong(); + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0(a); + + int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); + + int fpcr = rnd & (1 << (int)Fpcr.Fz); + fpcr |= rnd & (1 << (int)Fpcr.Dn); + + SingleOpcode(opcodes, v0: v0, v1: v1, fpcr: fpcr); + + CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Idc); + } + + [Test, Pairwise] [Explicit] + public void F_Add_P_S_2DD([ValueSource("_F_Add_P_S_2DD_")] uint opcodes, + [ValueSource("_1D_F_")] ulong a) + { + ulong z = TestContext.CurrentContext.Random.NextULong(); + Vector128 v0 = MakeVectorE1(z); + Vector128 v1 = MakeVectorE0E1(a, a); + + int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); + + int fpcr = rnd & (1 << (int)Fpcr.Fz); + fpcr |= rnd & (1 << (int)Fpcr.Dn); + + SingleOpcode(opcodes, v0: v0, v1: v1, fpcr: fpcr); + + CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Idc); + } + + [Test, Pairwise] [Explicit] + public void F_Cm_EqGeGtLeLt_S_S([ValueSource("_F_Cm_EqGeGtLeLt_S_S_")] uint opcodes, + [ValueSource("_1S_F_")] ulong a) + { + ulong z = TestContext.CurrentContext.Random.NextULong(); + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0(a); + + int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); + + int fpcr = rnd & (1 << (int)Fpcr.Fz); + + SingleOpcode(opcodes, v0: v0, v1: v1, fpcr: fpcr); + + CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Idc); + } + + [Test, Pairwise] [Explicit] + public void F_Cm_EqGeGtLeLt_S_D([ValueSource("_F_Cm_EqGeGtLeLt_S_D_")] uint opcodes, + [ValueSource("_1D_F_")] ulong a) + { + ulong z = TestContext.CurrentContext.Random.NextULong(); + Vector128 v0 = MakeVectorE1(z); + Vector128 v1 = MakeVectorE0(a); + + int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); + + int fpcr = rnd & (1 << (int)Fpcr.Fz); + + SingleOpcode(opcodes, v0: v0, v1: v1, fpcr: fpcr); + + CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Idc); + } + + [Test, Pairwise] [Explicit] + public void F_Cm_EqGeGtLeLt_V_2S_4S([ValueSource("_F_Cm_EqGeGtLeLt_V_2S_4S_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [ValueSource("_2S_F_")] ulong z, + [ValueSource("_2S_F_")] ulong a, + [Values(0b0u, 0b1u)] uint q) // <2S, 4S> + { + opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); + opcodes |= ((q & 1) << 30); + + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0E1(a, a * q); + + int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); + + int fpcr = rnd & (1 << (int)Fpcr.Fz); + + SingleOpcode(opcodes, v0: v0, v1: v1, fpcr: fpcr); + + CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Idc); + } + + [Test, Pairwise] [Explicit] + public void F_Cm_EqGeGtLeLt_V_2D([ValueSource("_F_Cm_EqGeGtLeLt_V_2D_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [ValueSource("_1D_F_")] ulong z, + [ValueSource("_1D_F_")] ulong a) + { + opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); + + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0E1(a, a); + + int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); + + int fpcr = rnd & (1 << (int)Fpcr.Fz); + + SingleOpcode(opcodes, v0: v0, v1: v1, fpcr: fpcr); + + CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Idc); + } + [Test, Pairwise] [Explicit] public void F_Cmp_Cmpe_S_S([ValueSource("_F_Cmp_Cmpe_S_S_")] uint opcodes, [ValueSource("_1S_F_")] ulong a) @@ -1089,13 +1385,13 @@ namespace Ryujinx.Tests.Cpu [Values(1u, 0u)] uint rn, [ValueSource("_4H_F_")] ulong z, [ValueSource("_4H_F_")] ulong a, - [Values(0b0u, 0b1u)] uint q, // <4H, 8H> + [Values(0b0u, 0b1u)] uint q, // <4H4S, 8H4S> [Values(RMode.Rn)] RMode rMode) { opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); opcodes |= ((q & 1) << 30); - Vector128 v0 = MakeVectorE0E1(q == 0u ? z : 0ul, q == 1u ? z : 0ul); + Vector128 v0 = MakeVectorE0E1(z, z); Vector128 v1 = MakeVectorE0E1(q == 0u ? a : 0ul, q == 1u ? a : 0ul); int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); @@ -1116,12 +1412,12 @@ namespace Ryujinx.Tests.Cpu [Values(1u, 0u)] uint rn, [ValueSource("_2S_F_")] ulong z, [ValueSource("_2S_F_")] ulong a, - [Values(0b0u, 0b1u)] uint q) // <2S, 4S> + [Values(0b0u, 0b1u)] uint q) // <2S2D, 4S2D> { opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); opcodes |= ((q & 1) << 30); - Vector128 v0 = MakeVectorE0E1(q == 0u ? z : 0ul, q == 1u ? z : 0ul); + Vector128 v0 = MakeVectorE0E1(z, z); Vector128 v1 = MakeVectorE0E1(q == 0u ? a : 0ul, q == 1u ? a : 0ul); SingleOpcode(opcodes, v0: v0, v1: v1); @@ -1135,7 +1431,7 @@ namespace Ryujinx.Tests.Cpu [Values(1u, 0u)] uint rn, [ValueSource("_2S_F_")] ulong z, [ValueSource("_2S_F_")] ulong a, - [Values(0b0u, 0b1u)] uint q, // <4H, 8H> + [Values(0b0u, 0b1u)] uint q, // <4S4H, 4S8H> [Values(RMode.Rn)] RMode rMode) { opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); @@ -1162,7 +1458,7 @@ namespace Ryujinx.Tests.Cpu [Values(1u, 0u)] uint rn, [ValueSource("_1D_F_")] ulong z, [ValueSource("_1D_F_")] ulong a, - [Values(0b0u, 0b1u)] uint q) // <2S, 4S> + [Values(0b0u, 0b1u)] uint q) // <2D2S, 2D4S> { opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); opcodes |= ((q & 1) << 30); @@ -1176,48 +1472,53 @@ namespace Ryujinx.Tests.Cpu } [Test, Pairwise] [Explicit] - public void F_Abs_Neg_Recpx_Sqrt_S_S([ValueSource("_F_Abs_Neg_Recpx_Sqrt_S_S_")] uint opcodes, - [ValueSource("_1S_F_")] ulong a) + public void F_Recpe_Rsqrte_S_S([ValueSource("_F_Recpe_Rsqrte_S_S_")] uint opcodes, + [ValueSource("_1S_F_")] ulong a, + [Values(RMode.Rn)] RMode rMode) { ulong z = TestContext.CurrentContext.Random.NextULong(); Vector128 v0 = MakeVectorE0E1(z, z); - Vector128 v1 = MakeVectorE0E1(a, z); + Vector128 v1 = MakeVectorE0(a); int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); - int fpcr = rnd & (1 << (int)Fpcr.Fz); + int fpcr = (int)rMode << (int)Fpcr.RMode; + fpcr |= rnd & (1 << (int)Fpcr.Fz); fpcr |= rnd & (1 << (int)Fpcr.Dn); SingleOpcode(opcodes, v0: v0, v1: v1, fpcr: fpcr); - CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Idc); + CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Dzc | Fpsr.Ofc | Fpsr.Ufc | Fpsr.Ixc | Fpsr.Idc); } [Test, Pairwise] [Explicit] - public void F_Abs_Neg_Recpx_Sqrt_S_D([ValueSource("_F_Abs_Neg_Recpx_Sqrt_S_D_")] uint opcodes, - [ValueSource("_1D_F_")] ulong a) + public void F_Recpe_Rsqrte_S_D([ValueSource("_F_Recpe_Rsqrte_S_D_")] uint opcodes, + [ValueSource("_1D_F_")] ulong a, + [Values(RMode.Rn)] RMode rMode) { ulong z = TestContext.CurrentContext.Random.NextULong(); Vector128 v0 = MakeVectorE1(z); - Vector128 v1 = MakeVectorE0E1(a, z); + Vector128 v1 = MakeVectorE0(a); int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); - int fpcr = rnd & (1 << (int)Fpcr.Fz); + int fpcr = (int)rMode << (int)Fpcr.RMode; + fpcr |= rnd & (1 << (int)Fpcr.Fz); fpcr |= rnd & (1 << (int)Fpcr.Dn); SingleOpcode(opcodes, v0: v0, v1: v1, fpcr: fpcr); - CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Idc); + CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Dzc | Fpsr.Ofc | Fpsr.Ufc | Fpsr.Ixc | Fpsr.Idc); } [Test, Pairwise] [Explicit] - public void F_Abs_Neg_Sqrt_V_2S_4S([ValueSource("_F_Abs_Neg_Sqrt_V_2S_4S_")] uint opcodes, + public void F_Recpe_Rsqrte_V_2S_4S([ValueSource("_F_Recpe_Rsqrte_V_2S_4S_")] uint opcodes, [Values(0u)] uint rd, [Values(1u, 0u)] uint rn, [ValueSource("_2S_F_")] ulong z, [ValueSource("_2S_F_")] ulong a, - [Values(0b0u, 0b1u)] uint q) // <2S, 4S> + [Values(0b0u, 0b1u)] uint q, // <2S, 4S> + [Values(RMode.Rn)] RMode rMode) { opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); opcodes |= ((q & 1) << 30); @@ -1227,20 +1528,22 @@ namespace Ryujinx.Tests.Cpu int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); - int fpcr = rnd & (1 << (int)Fpcr.Fz); + int fpcr = (int)rMode << (int)Fpcr.RMode; + fpcr |= rnd & (1 << (int)Fpcr.Fz); fpcr |= rnd & (1 << (int)Fpcr.Dn); SingleOpcode(opcodes, v0: v0, v1: v1, fpcr: fpcr); - CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Idc); + CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Dzc | Fpsr.Ofc | Fpsr.Ufc | Fpsr.Ixc | Fpsr.Idc); } [Test, Pairwise] [Explicit] - public void F_Abs_Neg_Sqrt_V_2D([ValueSource("_F_Abs_Neg_Sqrt_V_2D_")] uint opcodes, + public void F_Recpe_Rsqrte_V_2D([ValueSource("_F_Recpe_Rsqrte_V_2D_")] uint opcodes, [Values(0u)] uint rd, [Values(1u, 0u)] uint rn, [ValueSource("_1D_F_")] ulong z, - [ValueSource("_1D_F_")] ulong a) + [ValueSource("_1D_F_")] ulong a, + [Values(RMode.Rn)] RMode rMode) { opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); @@ -1249,12 +1552,13 @@ namespace Ryujinx.Tests.Cpu int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); - int fpcr = rnd & (1 << (int)Fpcr.Fz); + int fpcr = (int)rMode << (int)Fpcr.RMode; + fpcr |= rnd & (1 << (int)Fpcr.Fz); fpcr |= rnd & (1 << (int)Fpcr.Dn); SingleOpcode(opcodes, v0: v0, v1: v1, fpcr: fpcr); - CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Idc); + CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Dzc | Fpsr.Ofc | Fpsr.Ufc | Fpsr.Ixc | Fpsr.Idc); } [Test, Pairwise, Description("NEG , ")] @@ -1662,6 +1966,27 @@ namespace Ryujinx.Tests.Cpu CompareAgainstUnicorn(); } + [Test, Pairwise, Description("SHLL{2} ., ., #")] + public void Shll_V([Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [ValueSource("_8B4H2S_")] [Random(RndCnt)] ulong z, + [ValueSource("_8B4H2S_")] [Random(RndCnt)] ulong a, + [Values(0b00u, 0b01u, 0b10u)] uint size, // + [Values(0b0u, 0b1u)] uint q) + { + uint opcode = 0x2E213800; // SHLL V0.8H, V0.8B, #8 + opcode |= ((rn & 31) << 5) | ((rd & 31) << 0); + opcode |= ((size & 3) << 22); + opcode |= ((q & 1) << 30); + + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0E1(q == 0u ? a : 0ul, q == 1u ? a : 0ul); + + SingleOpcode(opcode, v0: v0, v1: v1); + + CompareAgainstUnicorn(); + } + [Test, Pairwise, Description("SQABS , ")] public void Sqabs_S_B_H_S_D([Values(0u)] uint rd, [Values(1u, 0u)] uint rn, diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs b/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs index cceb8b10..8d2f4e9a 100644 --- a/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs +++ b/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs @@ -206,6 +206,7 @@ namespace Ryujinx.Tests.Cpu { return new uint[] { + 0x7EA2D420u, // FABD S0, S1, S2 0x1E222820u, // FADD S0, S1, S2 0x1E221820u, // FDIV S0, S1, S2 0x1E220820u, // FMUL S0, S1, S2 @@ -218,6 +219,7 @@ namespace Ryujinx.Tests.Cpu { return new uint[] { + 0x7EE2D420u, // FABD D0, D1, D2 0x1E622820u, // FADD D0, D1, D2 0x1E621820u, // FDIV D0, D1, D2 0x1E620820u, // FMUL D0, D1, D2 @@ -226,11 +228,13 @@ namespace Ryujinx.Tests.Cpu }; } - private static uint[] _F_Add_Div_Mul_Mulx_Sub_V_2S_4S_() + private static uint[] _F_Add_Div_Mul_Mulx_Sub_P_V_2S_4S_() { return new uint[] { + 0x2EA0D400u, // FABD V0.2S, V0.2S, V0.2S 0x0E20D400u, // FADD V0.2S, V0.2S, V0.2S + 0x2E20D400u, // FADDP V0.2S, V0.2S, V0.2S 0x2E20FC00u, // FDIV V0.2S, V0.2S, V0.2S 0x2E20DC00u, // FMUL V0.2S, V0.2S, V0.2S 0x0E20DC00u, // FMULX V0.2S, V0.2S, V0.2S @@ -238,11 +242,13 @@ namespace Ryujinx.Tests.Cpu }; } - private static uint[] _F_Add_Div_Mul_Mulx_Sub_V_2D_() + private static uint[] _F_Add_Div_Mul_Mulx_Sub_P_V_2D_() { return new uint[] { + 0x6EE0D400u, // FABD V0.2D, V0.2D, V0.2D 0x4E60D400u, // FADD V0.2D, V0.2D, V0.2D + 0x6E60D400u, // FADDP V0.2D, V0.2D, V0.2D 0x6E60FC00u, // FDIV V0.2D, V0.2D, V0.2D 0x6E60DC00u, // FMUL V0.2D, V0.2D, V0.2D 0x4E60DC00u, // FMULX V0.2D, V0.2D, V0.2D @@ -250,6 +256,46 @@ namespace Ryujinx.Tests.Cpu }; } + private static uint[] _F_Cm_EqGeGt_S_S_() + { + return new uint[] + { + 0x5E22E420u, // FCMEQ S0, S1, S2 + 0x7E22E420u, // FCMGE S0, S1, S2 + 0x7EA2E420u // FCMGT S0, S1, S2 + }; + } + + private static uint[] _F_Cm_EqGeGt_S_D_() + { + return new uint[] + { + 0x5E62E420u, // FCMEQ D0, D1, D2 + 0x7E62E420u, // FCMGE D0, D1, D2 + 0x7EE2E420u // FCMGT D0, D1, D2 + }; + } + + private static uint[] _F_Cm_EqGeGt_V_2S_4S_() + { + return new uint[] + { + 0x0E20E400u, // FCMEQ V0.2S, V0.2S, V0.2S + 0x2E20E400u, // FCMGE V0.2S, V0.2S, V0.2S + 0x2EA0E400u // FCMGT V0.2S, V0.2S, V0.2S + }; + } + + private static uint[] _F_Cm_EqGeGt_V_2D_() + { + return new uint[] + { + 0x4E60E400u, // FCMEQ V0.2D, V0.2D, V0.2D + 0x6E60E400u, // FCMGE V0.2D, V0.2D, V0.2D + 0x6EE0E400u // FCMGT V0.2D, V0.2D, V0.2D + }; + } + private static uint[] _F_Cmp_Cmpe_S_S_() { return new uint[] @@ -1285,14 +1331,14 @@ namespace Ryujinx.Tests.Cpu } [Test, Pairwise] [Explicit] - public void F_Add_Div_Mul_Mulx_Sub_V_2S_4S([ValueSource("_F_Add_Div_Mul_Mulx_Sub_V_2S_4S_")] uint opcodes, - [Values(0u)] uint rd, - [Values(1u, 0u)] uint rn, - [Values(2u, 0u)] uint rm, - [ValueSource("_2S_F_")] ulong z, - [ValueSource("_2S_F_")] ulong a, - [ValueSource("_2S_F_")] ulong b, - [Values(0b0u, 0b1u)] uint q) // <2S, 4S> + public void F_Add_Div_Mul_Mulx_Sub_P_V_2S_4S([ValueSource("_F_Add_Div_Mul_Mulx_Sub_P_V_2S_4S_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [Values(2u, 0u)] uint rm, + [ValueSource("_2S_F_")] ulong z, + [ValueSource("_2S_F_")] ulong a, + [ValueSource("_2S_F_")] ulong b, + [Values(0b0u, 0b1u)] uint q) // <2S, 4S> { opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); opcodes |= ((q & 1) << 30); @@ -1312,13 +1358,13 @@ namespace Ryujinx.Tests.Cpu } [Test, Pairwise] [Explicit] - public void F_Add_Div_Mul_Mulx_Sub_V_2D([ValueSource("_F_Add_Div_Mul_Mulx_Sub_V_2D_")] uint opcodes, - [Values(0u)] uint rd, - [Values(1u, 0u)] uint rn, - [Values(2u, 0u)] uint rm, - [ValueSource("_1D_F_")] ulong z, - [ValueSource("_1D_F_")] ulong a, - [ValueSource("_1D_F_")] ulong b) + public void F_Add_Div_Mul_Mulx_Sub_P_V_2D([ValueSource("_F_Add_Div_Mul_Mulx_Sub_P_V_2D_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [Values(2u, 0u)] uint rm, + [ValueSource("_1D_F_")] ulong z, + [ValueSource("_1D_F_")] ulong a, + [ValueSource("_1D_F_")] ulong b) { opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); @@ -1336,6 +1382,94 @@ namespace Ryujinx.Tests.Cpu CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Dzc | Fpsr.Idc); } + [Test, Pairwise] [Explicit] + public void F_Cm_EqGeGt_S_S([ValueSource("_F_Cm_EqGeGt_S_S_")] uint opcodes, + [ValueSource("_1S_F_")] ulong a, + [ValueSource("_1S_F_")] ulong b) + { + ulong z = TestContext.CurrentContext.Random.NextULong(); + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0(a); + Vector128 v2 = MakeVectorE0(b); + + int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); + + int fpcr = rnd & (1 << (int)Fpcr.Fz); + + SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2, fpcr: fpcr); + + CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Idc); + } + + [Test, Pairwise] [Explicit] + public void F_Cm_EqGeGt_S_D([ValueSource("_F_Cm_EqGeGt_S_D_")] uint opcodes, + [ValueSource("_1D_F_")] ulong a, + [ValueSource("_1D_F_")] ulong b) + { + ulong z = TestContext.CurrentContext.Random.NextULong(); + Vector128 v0 = MakeVectorE1(z); + Vector128 v1 = MakeVectorE0(a); + Vector128 v2 = MakeVectorE0(b); + + int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); + + int fpcr = rnd & (1 << (int)Fpcr.Fz); + + SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2, fpcr: fpcr); + + CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Idc); + } + + [Test, Pairwise] [Explicit] + public void F_Cm_EqGeGt_V_2S_4S([ValueSource("_F_Cm_EqGeGt_V_2S_4S_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [Values(2u, 0u)] uint rm, + [ValueSource("_2S_F_")] ulong z, + [ValueSource("_2S_F_")] ulong a, + [ValueSource("_2S_F_")] ulong b, + [Values(0b0u, 0b1u)] uint q) // <2S, 4S> + { + opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); + opcodes |= ((q & 1) << 30); + + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0E1(a, a * q); + Vector128 v2 = MakeVectorE0E1(b, b * q); + + int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); + + int fpcr = rnd & (1 << (int)Fpcr.Fz); + + SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2, fpcr: fpcr); + + CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Idc); + } + + [Test, Pairwise] [Explicit] + public void F_Cm_EqGeGt_V_2D([ValueSource("_F_Cm_EqGeGt_V_2D_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [Values(2u, 0u)] uint rm, + [ValueSource("_1D_F_")] ulong z, + [ValueSource("_1D_F_")] ulong a, + [ValueSource("_1D_F_")] ulong b) + { + opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); + + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0E1(a, a); + Vector128 v2 = MakeVectorE0E1(b, b); + + int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); + + int fpcr = rnd & (1 << (int)Fpcr.Fz); + + SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2, fpcr: fpcr); + + CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Idc); + } + [Test, Pairwise] [Explicit] public void F_Cmp_Cmpe_S_S([ValueSource("_F_Cmp_Cmpe_S_S_")] uint opcodes, [ValueSource("_1S_F_")] ulong a, diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs b/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs index c9c4c1ed..f026158c 100644 --- a/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs +++ b/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs @@ -50,6 +50,33 @@ namespace Ryujinx.Tests.Cpu #endregion #region "ValueSource (Opcodes)" + private static uint[] _SU_Shll_V_8B8H_16B8H_() + { + return new uint[] + { + 0x0F08A400u, // SSHLL V0.8H, V0.8B, #0 + 0x2F08A400u // USHLL V0.8H, V0.8B, #0 + }; + } + + private static uint[] _SU_Shll_V_4H4S_8H4S_() + { + return new uint[] + { + 0x0F10A400u, // SSHLL V0.4S, V0.4H, #0 + 0x2F10A400u // USHLL V0.4S, V0.4H, #0 + }; + } + + private static uint[] _SU_Shll_V_2S2D_4S2D_() + { + return new uint[] + { + 0x0F20A400u, // SSHLL V0.2D, V0.2S, #0 + 0x2F20A400u // USHLL V0.2D, V0.2S, #0 + }; + } + private static uint[] _ShrImm_S_D_() { return new uint[] @@ -344,6 +371,75 @@ namespace Ryujinx.Tests.Cpu CompareAgainstUnicorn(); } + [Test, Pairwise] + public void SU_Shll_V_8B8H_16B8H([ValueSource("_SU_Shll_V_8B8H_16B8H_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [ValueSource("_8B_")] [Random(RndCnt)] ulong z, + [ValueSource("_8B_")] [Random(RndCnt)] ulong a, + [Range(0u, 7u)] uint shift, + [Values(0b0u, 0b1u)] uint q) // <8B8H, 16B8H> + { + uint immHb = (8 + shift) & 0x7F; + + opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); + opcodes |= (immHb << 16); + opcodes |= ((q & 1) << 30); + + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0E1(q == 0u ? a : 0ul, q == 1u ? a : 0ul); + + SingleOpcode(opcodes, v0: v0, v1: v1); + + CompareAgainstUnicorn(); + } + + [Test, Pairwise] + public void SU_Shll_V_4H4S_8H4S([ValueSource("_SU_Shll_V_4H4S_8H4S_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [ValueSource("_4H_")] [Random(RndCnt)] ulong z, + [ValueSource("_4H_")] [Random(RndCnt)] ulong a, + [Range(0u, 15u)] uint shift, + [Values(0b0u, 0b1u)] uint q) // <4H4S, 8H4S> + { + uint immHb = (16 + shift) & 0x7F; + + opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); + opcodes |= (immHb << 16); + opcodes |= ((q & 1) << 30); + + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0E1(q == 0u ? a : 0ul, q == 1u ? a : 0ul); + + SingleOpcode(opcodes, v0: v0, v1: v1); + + CompareAgainstUnicorn(); + } + + [Test, Pairwise] + public void SU_Shll_V_2S2D_4S2D([ValueSource("_SU_Shll_V_2S2D_4S2D_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [ValueSource("_2S_")] [Random(RndCnt)] ulong z, + [ValueSource("_2S_")] [Random(RndCnt)] ulong a, + [Range(0u, 31u)] uint shift, + [Values(0b0u, 0b1u)] uint q) // <2S2D, 4S2D> + { + uint immHb = (32 + shift) & 0x7F; + + opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); + opcodes |= (immHb << 16); + opcodes |= ((q & 1) << 30); + + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0E1(q == 0u ? a : 0ul, q == 1u ? a : 0ul); + + SingleOpcode(opcodes, v0: v0, v1: v1); + + CompareAgainstUnicorn(); + } + [Test, Pairwise] public void ShrImm_S_D([ValueSource("_ShrImm_S_D_")] uint opcodes, [Values(0u)] uint rd,