From 74da8785a5f3a79914182d384e966fb5d27fa708 Mon Sep 17 00:00:00 2001 From: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com> Date: Sun, 21 Apr 2019 04:07:35 +0200 Subject: [PATCH] Sse optimized the 32-bit Vector & Scalar integer-to-fp conversion instructions (signed & unsigned); added the related Gp & V_Fixed Tests (signed & unsigned). (#662) * Update CpuTestSimdCvt.cs * Update CpuTestSimd.cs * Update CpuTestSimdShImm.cs * Update InstEmitSimdCvt.cs * Update OpCodeTable.cs * Update InstEmitSimdCvt.cs --- ChocolArm64/Instructions/InstEmitSimdCvt.cs | 191 +++++++++++++++++--- ChocolArm64/OpCodeTable.cs | 4 + Ryujinx.Tests/Cpu/CpuTestSimd.cs | 4 +- Ryujinx.Tests/Cpu/CpuTestSimdCvt.cs | 106 ++++++++++- Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs | 62 +++++++ 5 files changed, 342 insertions(+), 25 deletions(-) diff --git a/ChocolArm64/Instructions/InstEmitSimdCvt.cs b/ChocolArm64/Instructions/InstEmitSimdCvt.cs index ab2fb6a8..c5f16f86 100644 --- a/ChocolArm64/Instructions/InstEmitSimdCvt.cs +++ b/ChocolArm64/Instructions/InstEmitSimdCvt.cs @@ -363,7 +363,7 @@ namespace ChocolArm64.Instructions if (context.CurrOp.RegisterSize == RegisterSize.Int32) { - context.Emit(OpCodes.Conv_U4); + context.Emit(OpCodes.Conv_I4); } EmitFloatCast(context, op.Size); @@ -393,11 +393,20 @@ namespace ChocolArm64.Instructions { OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; - EmitVectorExtractSx(context, op.Rn, 0, op.Size + 2); + int sizeF = op.Size & 1; - EmitFloatCast(context, op.Size); + if (Optimizations.UseSse2 && sizeF == 0) + { + EmitSse2cvtF_Signed(context, scalar: true); + } + else + { + EmitVectorExtractSx(context, op.Rn, 0, sizeF + 2); - EmitScalarSetF(context, op.Rd, op.Size); + EmitFloatCast(context, sizeF); + + EmitScalarSetF(context, op.Rd, sizeF); + } } public static void Scvtf_V(ILEmitterCtx context) @@ -408,18 +417,24 @@ namespace ChocolArm64.Instructions if (Optimizations.UseSse2 && sizeF == 0) { - Type[] typesCvt = new Type[] { typeof(Vector128) }; + EmitSse2cvtF_Signed(context, scalar: false); + } + else + { + EmitVectorCvtf(context, signed: true); + } + } - context.EmitLdvec(op.Rn); + public static void Scvtf_V_Fixed(ILEmitterCtx context) + { + OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; - context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Single), typesCvt)); + // sizeF == ((OpCodeSimdShImm64)op).Size - 2 + int sizeF = op.Size & 1; - context.EmitStvec(op.Rd); - - if (op.RegisterSize == RegisterSize.Simd64) - { - EmitVectorZeroUpper(context, op.Rd); - } + if (Optimizations.UseSse2 && sizeF == 0) + { + EmitSse2cvtF_Signed(context, scalar: false); } else { @@ -469,18 +484,55 @@ namespace ChocolArm64.Instructions { OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; - EmitVectorExtractZx(context, op.Rn, 0, op.Size + 2); + int sizeF = op.Size & 1; - context.Emit(OpCodes.Conv_R_Un); + if (Optimizations.UseSse2 && sizeF == 0) + { + EmitSse2cvtF_Unsigned(context, scalar: true); + } + else + { + EmitVectorExtractZx(context, op.Rn, 0, sizeF + 2); - EmitFloatCast(context, op.Size); + context.Emit(OpCodes.Conv_R_Un); - EmitScalarSetF(context, op.Rd, op.Size); + EmitFloatCast(context, sizeF); + + EmitScalarSetF(context, op.Rd, sizeF); + } } public static void Ucvtf_V(ILEmitterCtx context) { - EmitVectorCvtf(context, signed: false); + OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; + + int sizeF = op.Size & 1; + + if (Optimizations.UseSse2 && sizeF == 0) + { + EmitSse2cvtF_Unsigned(context, scalar: false); + } + else + { + EmitVectorCvtf(context, signed: false); + } + } + + public static void Ucvtf_V_Fixed(ILEmitterCtx context) + { + OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; + + // sizeF == ((OpCodeSimdShImm64)op).Size - 2 + int sizeF = op.Size & 1; + + if (Optimizations.UseSse2 && sizeF == 0) + { + EmitSse2cvtF_Unsigned(context, scalar: false); + } + else + { + EmitVectorCvtf(context, signed: false); + } } private static void EmitFcvtn(ILEmitterCtx context, bool signed, bool scalar) @@ -838,7 +890,7 @@ namespace ChocolArm64.Instructions int fBits = GetImmShr(fixedOp); // BitConverter.Int32BitsToSingle(fpScaled) == MathF.Pow(2f, fBits) - int fpScaled = 0x40000000 + (fBits - 1) * 0x800000; + int fpScaled = 0x3F800000 + fBits * 0x800000; context.EmitLdc_I4(fpScaled); context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav)); @@ -894,7 +946,7 @@ namespace ChocolArm64.Instructions int fBits = GetImmShr(fixedOp); // BitConverter.Int64BitsToDouble(fpScaled) == Math.Pow(2d, fBits) - long fpScaled = 0x4000000000000000L + (fBits - 1) * 0x10000000000000L; + long fpScaled = 0x3FF0000000000000L + fBits * 0x10000000000000L; context.EmitLdc_I8(fpScaled); context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav)); @@ -972,7 +1024,7 @@ namespace ChocolArm64.Instructions int fBits = GetImmShr(fixedOp); // BitConverter.Int32BitsToSingle(fpScaled) == MathF.Pow(2f, fBits) - int fpScaled = 0x40000000 + (fBits - 1) * 0x800000; + int fpScaled = 0x3F800000 + fBits * 0x800000; context.EmitLdc_I4(fpScaled); context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav)); @@ -1060,7 +1112,7 @@ namespace ChocolArm64.Instructions int fBits = GetImmShr(fixedOp); // BitConverter.Int64BitsToDouble(fpScaled) == Math.Pow(2d, fBits) - long fpScaled = 0x4000000000000000L + (fBits - 1) * 0x10000000000000L; + long fpScaled = 0x3FF0000000000000L + fBits * 0x10000000000000L; context.EmitLdc_I8(fpScaled); context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav)); @@ -1158,6 +1210,101 @@ namespace ChocolArm64.Instructions } } + private static void EmitSse2cvtF_Signed(ILEmitterCtx context, bool scalar) + { + OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; + + Type[] typesMul = new Type[] { typeof(Vector128), typeof(Vector128) }; + Type[] typesCvt = new Type[] { typeof(Vector128) }; + Type[] typesSav = new Type[] { typeof(int) }; + + context.EmitLdvec(op.Rn); + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Single), typesCvt)); + + if (op is OpCodeSimdShImm64 fixedOp) + { + int fBits = GetImmShr(fixedOp); + + // BitConverter.Int32BitsToSingle(fpScaled) == 1f / MathF.Pow(2f, fBits) + int fpScaled = 0x3F800000 - fBits * 0x800000; + + context.EmitLdc_I4(fpScaled); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav)); + + context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Multiply), typesMul)); + } + + context.EmitStvec(op.Rd); + + if (scalar) + { + EmitVectorZero32_128(context, op.Rd); + } + else if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } + } + + private static void EmitSse2cvtF_Unsigned(ILEmitterCtx context, bool scalar) + { + OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; + + Type[] typesMulAdd = new Type[] { typeof(Vector128), typeof(Vector128) }; + Type[] typesSrlSll = new Type[] { typeof(Vector128), typeof(byte) }; + Type[] typesCvt = new Type[] { typeof(Vector128) }; + Type[] typesSav = new Type[] { typeof(int) }; + + context.EmitLdvec(op.Rn); + + context.EmitLdc_I4(16); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesSrlSll)); + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Single), typesCvt)); + + context.EmitLdc_I4(0x47800000); // 65536.0f (1 << 16) + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav)); + + context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Multiply), typesMulAdd)); + + context.EmitLdvec(op.Rn); + + context.EmitLdc_I4(16); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesSrlSll)); + + context.EmitLdc_I4(16); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesSrlSll)); + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Single), typesCvt)); + + context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Add), typesMulAdd)); + + if (op is OpCodeSimdShImm64 fixedOp) + { + int fBits = GetImmShr(fixedOp); + + // BitConverter.Int32BitsToSingle(fpScaled) == 1f / MathF.Pow(2f, fBits) + int fpScaled = 0x3F800000 - fBits * 0x800000; + + context.EmitLdc_I4(fpScaled); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav)); + + context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Multiply), typesMulAdd)); + } + + context.EmitStvec(op.Rd); + + if (scalar) + { + EmitVectorZero32_128(context, op.Rd); + } + else if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } + } + private static string GetSse41NameRnd(RoundMode roundMode) { switch (roundMode) diff --git a/ChocolArm64/OpCodeTable.cs b/ChocolArm64/OpCodeTable.cs index 50bc6a1d..fb8b19cd 100644 --- a/ChocolArm64/OpCodeTable.cs +++ b/ChocolArm64/OpCodeTable.cs @@ -439,6 +439,8 @@ namespace ChocolArm64 SetA64(">00111100x000010>xxxxxxxxxxxxxxx", InstEmit.Scvtf_Gp_Fixed, typeof(OpCodeSimdCvt64)); SetA64("010111100x100001110110xxxxxxxxxx", InstEmit.Scvtf_S, typeof(OpCodeSimd64)); SetA64("0>0011100<100001110110xxxxxxxxxx", InstEmit.Scvtf_V, typeof(OpCodeSimd64)); + SetA64("0x001111001xxxxx111001xxxxxxxxxx", InstEmit.Scvtf_V_Fixed, typeof(OpCodeSimdShImm64)); + SetA64("0100111101xxxxxx111001xxxxxxxxxx", InstEmit.Scvtf_V_Fixed, typeof(OpCodeSimdShImm64)); SetA64("01011110000xxxxx000000xxxxxxxxxx", InstEmit.Sha1c_V, typeof(OpCodeSimdReg64)); SetA64("0101111000101000000010xxxxxxxxxx", InstEmit.Sha1h_V, typeof(OpCodeSimd64)); SetA64("01011110000xxxxx001000xxxxxxxxxx", InstEmit.Sha1m_V, typeof(OpCodeSimdReg64)); @@ -548,6 +550,8 @@ namespace ChocolArm64 SetA64(">00111100x000011>xxxxxxxxxxxxxxx", InstEmit.Ucvtf_Gp_Fixed, typeof(OpCodeSimdCvt64)); SetA64("011111100x100001110110xxxxxxxxxx", InstEmit.Ucvtf_S, typeof(OpCodeSimd64)); SetA64("0>1011100<100001110110xxxxxxxxxx", InstEmit.Ucvtf_V, typeof(OpCodeSimd64)); + SetA64("0x101111001xxxxx111001xxxxxxxxxx", InstEmit.Ucvtf_V_Fixed, typeof(OpCodeSimdShImm64)); + SetA64("0110111101xxxxxx111001xxxxxxxxxx", InstEmit.Ucvtf_V_Fixed, typeof(OpCodeSimdShImm64)); SetA64("0x101110<<1xxxxx000001xxxxxxxxxx", InstEmit.Uhadd_V, typeof(OpCodeSimdReg64)); SetA64("0x101110<<1xxxxx001001xxxxxxxxxx", InstEmit.Uhsub_V, typeof(OpCodeSimdReg64)); SetA64("0x101110<<1xxxxx011001xxxxxxxxxx", InstEmit.Umax_V, typeof(OpCodeSimdReg64)); diff --git a/Ryujinx.Tests/Cpu/CpuTestSimd.cs b/Ryujinx.Tests/Cpu/CpuTestSimd.cs index fd395da8..df23f2ef 100644 --- a/Ryujinx.Tests/Cpu/CpuTestSimd.cs +++ b/Ryujinx.Tests/Cpu/CpuTestSimd.cs @@ -2082,7 +2082,7 @@ namespace Ryujinx.Tests.Cpu SingleOpcode(opcodes, v0: v0, v1: v1); - CompareAgainstUnicorn(); + CompareAgainstUnicorn(fpTolerances: FpTolerances.UpToOneUlpsD); // unsigned } [Test, Pairwise] [Explicit] @@ -2118,7 +2118,7 @@ namespace Ryujinx.Tests.Cpu SingleOpcode(opcodes, v0: v0, v1: v1); - CompareAgainstUnicorn(); + CompareAgainstUnicorn(fpTolerances: FpTolerances.UpToOneUlpsD); // unsigned } [Test, Pairwise] diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdCvt.cs b/Ryujinx.Tests/Cpu/CpuTestSimdCvt.cs index 60935488..ff8e8027 100644 --- a/Ryujinx.Tests/Cpu/CpuTestSimdCvt.cs +++ b/Ryujinx.Tests/Cpu/CpuTestSimdCvt.cs @@ -288,6 +288,42 @@ namespace Ryujinx.Tests.Cpu }; } + private static uint[] _SU_Cvt_F_Gp_WS_() + { + return new uint[] + { + 0x1E220000u, // SCVTF S0, W0 + 0x1E230000u // UCVTF S0, W0 + }; + } + + private static uint[] _SU_Cvt_F_Gp_WD_() + { + return new uint[] + { + 0x1E620000u, // SCVTF D0, W0 + 0x1E630000u // UCVTF D0, W0 + }; + } + + private static uint[] _SU_Cvt_F_Gp_XS_() + { + return new uint[] + { + 0x9E220000u, // SCVTF S0, X0 + 0x9E230000u // UCVTF S0, X0 + }; + } + + private static uint[] _SU_Cvt_F_Gp_XD_() + { + return new uint[] + { + 0x9E620000u, // SCVTF D0, X0 + 0x9E630000u // UCVTF D0, X0 + }; + } + private static uint[] _SU_Cvt_F_Gp_Fixed_WS_() { return new uint[] @@ -480,6 +516,74 @@ namespace Ryujinx.Tests.Cpu CompareAgainstUnicorn(); } + [Test, Pairwise] [Explicit] + public void SU_Cvt_F_Gp_WS([ValueSource("_SU_Cvt_F_Gp_WS_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 31u)] uint rn, + [ValueSource("_W_")] [Random(RndCnt)] uint wn) + { + opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); + + uint w31 = TestContext.CurrentContext.Random.NextUInt(); + ulong z = TestContext.CurrentContext.Random.NextULong(); + Vector128 v0 = MakeVectorE0E1(z, z); + + SingleOpcode(opcodes, x1: wn, x31: w31, v0: v0); + + CompareAgainstUnicorn(); + } + + [Test, Pairwise] [Explicit] + public void SU_Cvt_F_Gp_WD([ValueSource("_SU_Cvt_F_Gp_WD_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 31u)] uint rn, + [ValueSource("_W_")] [Random(RndCnt)] uint wn) + { + opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); + + uint w31 = TestContext.CurrentContext.Random.NextUInt(); + ulong z = TestContext.CurrentContext.Random.NextULong(); + Vector128 v0 = MakeVectorE1(z); + + SingleOpcode(opcodes, x1: wn, x31: w31, v0: v0); + + CompareAgainstUnicorn(); + } + + [Test, Pairwise] [Explicit] + public void SU_Cvt_F_Gp_XS([ValueSource("_SU_Cvt_F_Gp_XS_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 31u)] uint rn, + [ValueSource("_X_")] [Random(RndCnt)] ulong xn) + { + opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); + + ulong x31 = TestContext.CurrentContext.Random.NextULong(); + ulong z = TestContext.CurrentContext.Random.NextULong(); + Vector128 v0 = MakeVectorE0E1(z, z); + + SingleOpcode(opcodes, x1: xn, x31: x31, v0: v0); + + CompareAgainstUnicorn(); + } + + [Test, Pairwise] [Explicit] + public void SU_Cvt_F_Gp_XD([ValueSource("_SU_Cvt_F_Gp_XD_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 31u)] uint rn, + [ValueSource("_X_")] [Random(RndCnt)] ulong xn) + { + opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); + + ulong x31 = TestContext.CurrentContext.Random.NextULong(); + ulong z = TestContext.CurrentContext.Random.NextULong(); + Vector128 v0 = MakeVectorE1(z); + + SingleOpcode(opcodes, x1: xn, x31: x31, v0: v0); + + CompareAgainstUnicorn(fpTolerances: FpTolerances.UpToOneUlpsD); // unsigned + } + [Test, Pairwise] [Explicit] public void SU_Cvt_F_Gp_Fixed_WS([ValueSource("_SU_Cvt_F_Gp_Fixed_WS_")] uint opcodes, [Values(0u)] uint rd, @@ -561,7 +665,7 @@ namespace Ryujinx.Tests.Cpu SingleOpcode(opcodes, x1: xn, x31: x31, v0: v0); - CompareAgainstUnicorn(); + CompareAgainstUnicorn(fpTolerances: FpTolerances.UpToOneUlpsD); // unsigned } #endif } diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs b/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs index cabaac02..c08949a5 100644 --- a/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs +++ b/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs @@ -194,6 +194,24 @@ namespace Ryujinx.Tests.Cpu }; } + private static uint[] _SU_Cvt_F_V_Fixed_2S_4S_() + { + return new uint[] + { + 0x0F20E400u, // SCVTF V0.2S, V0.2S, #32 + 0x2F20E400u // UCVTF V0.2S, V0.2S, #32 + }; + } + + private static uint[] _SU_Cvt_F_V_Fixed_2D_() + { + return new uint[] + { + 0x4F40E400u, // SCVTF V0.2D, V0.2D, #64 + 0x6F40E400u // UCVTF V0.2D, V0.2D, #64 + }; + } + private static uint[] _SU_Shll_V_8B8H_16B8H_() { return new uint[] @@ -454,6 +472,50 @@ namespace Ryujinx.Tests.Cpu CompareAgainstUnicorn(); } + [Test, Pairwise] [Explicit] + public void SU_Cvt_F_V_Fixed_2S_4S([ValueSource("_SU_Cvt_F_V_Fixed_2S_4S_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [ValueSource("_2S_")] [Random(RndCnt)] ulong z, + [ValueSource("_2S_")] [Random(RndCnt)] ulong a, + [Values(1u, 32u)] [Random(2u, 31u, RndCntFBits)] uint fBits, + [Values(0b0u, 0b1u)] uint q) // <2S, 4S> + { + uint immHb = (64 - fBits) & 0x7F; + + opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); + opcodes |= (immHb << 16); + opcodes |= ((q & 1) << 30); + + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0E1(a, a * q); + + SingleOpcode(opcodes, v0: v0, v1: v1); + + CompareAgainstUnicorn(); + } + + [Test, Pairwise] [Explicit] + public void SU_Cvt_F_V_Fixed_2D([ValueSource("_SU_Cvt_F_V_Fixed_2D_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [ValueSource("_1D_")] [Random(RndCnt)] ulong z, + [ValueSource("_1D_")] [Random(RndCnt)] ulong a, + [Values(1u, 64u)] [Random(2u, 63u, RndCntFBits)] uint fBits) + { + uint immHb = (128 - fBits) & 0x7F; + + opcodes |= ((rn & 31) << 5) | ((rd & 31) << 0); + opcodes |= (immHb << 16); + + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0E1(a, a); + + SingleOpcode(opcodes, v0: v0, v1: v1); + + CompareAgainstUnicorn(fpTolerances: FpTolerances.UpToOneUlpsD); // unsigned + } + [Test, Pairwise, Description("SHL , , #")] public void Shl_S_D([Values(0u)] uint rd, [Values(1u, 0u)] uint rn,