diff --git a/ARMeilleure/Decoders/OpCodeTable.cs b/ARMeilleure/Decoders/OpCodeTable.cs index 55cd5cb6..2fa7702d 100644 --- a/ARMeilleure/Decoders/OpCodeTable.cs +++ b/ARMeilleure/Decoders/OpCodeTable.cs @@ -437,6 +437,7 @@ namespace ARMeilleure.Decoders SetA64("0x101110<<100001001110xxxxxxxxxx", InstName.Shll_V, InstEmit.Shll_V, typeof(OpCodeSimd)); SetA64("0x00111100>>>xxx100001xxxxxxxxxx", InstName.Shrn_V, InstEmit.Shrn_V, typeof(OpCodeSimdShImm)); SetA64("0x001110<<1xxxxx001001xxxxxxxxxx", InstName.Shsub_V, InstEmit.Shsub_V, typeof(OpCodeSimdReg)); + SetA64("0111111101xxxxxx010101xxxxxxxxxx", InstName.Sli_S, InstEmit.Sli_S, typeof(OpCodeSimdShImm)); SetA64("0x10111100>>>xxx010101xxxxxxxxxx", InstName.Sli_V, InstEmit.Sli_V, typeof(OpCodeSimdShImm)); SetA64("0110111101xxxxxx010101xxxxxxxxxx", InstName.Sli_V, InstEmit.Sli_V, typeof(OpCodeSimdShImm)); SetA64("0x001110<<1xxxxx011001xxxxxxxxxx", InstName.Smax_V, InstEmit.Smax_V, typeof(OpCodeSimdReg)); @@ -485,6 +486,9 @@ namespace ARMeilleure.Decoders SetA64("01111110<<100001001010xxxxxxxxxx", InstName.Sqxtun_S, InstEmit.Sqxtun_S, typeof(OpCodeSimd)); SetA64("0x101110<<100001001010xxxxxxxxxx", InstName.Sqxtun_V, InstEmit.Sqxtun_V, typeof(OpCodeSimd)); SetA64("0x001110<<1xxxxx000101xxxxxxxxxx", InstName.Srhadd_V, InstEmit.Srhadd_V, typeof(OpCodeSimdReg)); + SetA64("0111111101xxxxxx010001xxxxxxxxxx", InstName.Sri_S, InstEmit.Sri_S, typeof(OpCodeSimdShImm)); + SetA64("0x10111100>>>xxx010001xxxxxxxxxx", InstName.Sri_V, InstEmit.Sri_V, typeof(OpCodeSimdShImm)); + SetA64("0110111101xxxxxx010001xxxxxxxxxx", InstName.Sri_V, InstEmit.Sri_V, typeof(OpCodeSimdShImm)); SetA64("0>001110<<1xxxxx010101xxxxxxxxxx", InstName.Srshl_V, InstEmit.Srshl_V, typeof(OpCodeSimdReg)); SetA64("0101111101xxxxxx001001xxxxxxxxxx", InstName.Srshr_S, InstEmit.Srshr_S, typeof(OpCodeSimdShImm)); SetA64("0x00111100>>>xxx001001xxxxxxxxxx", InstName.Srshr_V, InstEmit.Srshr_V, typeof(OpCodeSimdShImm)); diff --git a/ARMeilleure/Instructions/InstEmitSimdShift.cs b/ARMeilleure/Instructions/InstEmitSimdShift.cs index 1aae491d..17f43c81 100644 --- a/ARMeilleure/Instructions/InstEmitSimdShift.cs +++ b/ARMeilleure/Instructions/InstEmitSimdShift.cs @@ -22,6 +22,11 @@ namespace ARMeilleure.Instructions 13L << 56 | 12L << 48 | 09L << 40 | 08L << 32 | 05L << 24 | 04L << 16 | 01L << 8 | 00L << 0, 11L << 56 | 10L << 48 | 09L << 40 | 08L << 32 | 03L << 24 | 02L << 16 | 01L << 8 | 00L << 0 }; + + private static readonly long[] _masks_SliSri = new long[] // Replication masks. + { + 0x0101010101010101L, 0x0001000100010001L, 0x0000000100000001L, 0x0000000000000001L + }; #endregion public static void Rshrn_V(ArmEmitterContext context) @@ -66,7 +71,7 @@ namespace ARMeilleure.Instructions res = context.AddIntrinsic(movInst, dLow, res); - context.Copy(GetVec(op.Rd), res); + context.Copy(d, res); } else { @@ -106,7 +111,7 @@ namespace ARMeilleure.Instructions } else { - EmitVectorUnaryOpZx(context, (op1) => context.ShiftLeft(op1, Const(shift))); + EmitVectorUnaryOpZx(context, (op1) => context.ShiftLeft(op1, Const(shift))); } } @@ -149,8 +154,6 @@ namespace ARMeilleure.Instructions int shift = GetImmShr(op); - long roundConst = 1L << (shift - 1); - Operand d = GetVec(op.Rd); Operand n = GetVec(op.Rn); @@ -170,7 +173,7 @@ namespace ARMeilleure.Instructions res = context.AddIntrinsic(movInst, dLow, res); - context.Copy(GetVec(op.Rd), res); + context.Copy(d, res); } else { @@ -178,34 +181,14 @@ namespace ARMeilleure.Instructions } } + public static void Sli_S(ArmEmitterContext context) + { + EmitSli(context, scalar: true); + } + public static void Sli_V(ArmEmitterContext context) { - OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; - - Operand res = context.VectorZero(); - - int elems = op.GetBytesCount() >> op.Size; - - int shift = GetImmShl(op); - - ulong mask = shift != 0 ? ulong.MaxValue >> (64 - shift) : 0; - - for (int index = 0; index < elems; index++) - { - Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size); - - Operand neShifted = context.ShiftLeft(ne, Const(shift)); - - Operand de = EmitVectorExtractZx(context, op.Rd, index, op.Size); - - Operand deMasked = context.BitwiseAnd(de, Const(mask)); - - Operand e = context.BitwiseOr(neShifted, deMasked); - - res = EmitVectorInsert(context, res, e, index, op.Size); - } - - context.Copy(GetVec(op.Rd), res); + EmitSli(context, scalar: false); } public static void Sqrshl_V(ArmEmitterContext context) @@ -290,6 +273,16 @@ namespace ARMeilleure.Instructions EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorSxZx); } + public static void Sri_S(ArmEmitterContext context) + { + EmitSri(context, scalar: true); + } + + public static void Sri_V(ArmEmitterContext context) + { + EmitSri(context, scalar: false); + } + public static void Srshl_V(ArmEmitterContext context) { OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; @@ -395,7 +388,7 @@ namespace ARMeilleure.Instructions res = context.VectorZeroUpper64(res); } - context.Copy(GetVec(op.Rd), res); + context.Copy(d, res); } else { @@ -690,7 +683,7 @@ namespace ARMeilleure.Instructions res = context.VectorZeroUpper64(res); } - context.Copy(GetVec(op.Rd), res); + context.Copy(d, res); } else { @@ -1053,5 +1046,116 @@ namespace ARMeilleure.Instructions context.Copy(GetVec(op.Rd), res); } + + private static void EmitSli(ArmEmitterContext context, bool scalar) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShl(op); + + ulong mask = shift != 0 ? ulong.MaxValue >> (64 - shift) : 0UL; + + if (Optimizations.UseSse2 && op.Size > 0) + { + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + Intrinsic sllInst = X86PsllInstruction[op.Size]; + + Operand nShifted = context.AddIntrinsic(sllInst, n, Const(shift)); + + Operand dMask = X86GetAllElements(context, (long)mask * _masks_SliSri[op.Size]); + + Operand dMasked = context.AddIntrinsic(Intrinsic.X86Pand, d, dMask); + + Operand res = context.AddIntrinsic(Intrinsic.X86Por, nShifted, dMasked); + + if ((op.RegisterSize == RegisterSize.Simd64) || scalar) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(d, res); + } + else + { + Operand res = context.VectorZero(); + + int elems = !scalar ? op.GetBytesCount() >> op.Size : 1; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size); + + Operand neShifted = context.ShiftLeft(ne, Const(shift)); + + Operand de = EmitVectorExtractZx(context, op.Rd, index, op.Size); + + Operand deMasked = context.BitwiseAnd(de, Const(mask)); + + Operand e = context.BitwiseOr(neShifted, deMasked); + + res = EmitVectorInsert(context, res, e, index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + } + + private static void EmitSri(ArmEmitterContext context, bool scalar) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + int eSize = 8 << op.Size; + + ulong mask = (ulong.MaxValue << (eSize - shift)) & (ulong.MaxValue >> (64 - eSize)); + + if (Optimizations.UseSse2 && op.Size > 0) + { + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + Intrinsic srlInst = X86PsrlInstruction[op.Size]; + + Operand nShifted = context.AddIntrinsic(srlInst, n, Const(shift)); + + Operand dMask = X86GetAllElements(context, (long)mask * _masks_SliSri[op.Size]); + + Operand dMasked = context.AddIntrinsic(Intrinsic.X86Pand, d, dMask); + + Operand res = context.AddIntrinsic(Intrinsic.X86Por, nShifted, dMasked); + + if ((op.RegisterSize == RegisterSize.Simd64) || scalar) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(d, res); + } + else + { + Operand res = context.VectorZero(); + + int elems = !scalar ? op.GetBytesCount() >> op.Size : 1; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size); + + Operand neShifted = shift != 64 ? context.ShiftRightUI(ne, Const(shift)) : Const(0UL); + + Operand de = EmitVectorExtractZx(context, op.Rd, index, op.Size); + + Operand deMasked = context.BitwiseAnd(de, Const(mask)); + + Operand e = context.BitwiseOr(neShifted, deMasked); + + res = EmitVectorInsert(context, res, e, index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + } } } diff --git a/ARMeilleure/Instructions/InstName.cs b/ARMeilleure/Instructions/InstName.cs index e03ab616..c81484a6 100644 --- a/ARMeilleure/Instructions/InstName.cs +++ b/ARMeilleure/Instructions/InstName.cs @@ -313,6 +313,7 @@ namespace ARMeilleure.Instructions Shll_V, Shrn_V, Shsub_V, + Sli_S, Sli_V, Smax_V, Smaxp_V, @@ -354,6 +355,8 @@ namespace ARMeilleure.Instructions Sqxtun_S, Sqxtun_V, Srhadd_V, + Sri_S, + Sri_V, Srshl_V, Srshr_S, Srshr_V, diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs b/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs index fbbc9f9f..0f1b0dac 100644 --- a/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs +++ b/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs @@ -218,7 +218,7 @@ namespace Ryujinx.Tests.Cpu return new uint[] { 0x5F405400u, // SHL D0, D0, #0 - //0x7F405400u // SLI D0, D0, #0 + 0x7F405400u // SLI D0, D0, #0 }; } @@ -285,10 +285,11 @@ namespace Ryujinx.Tests.Cpu }; } - private static uint[] _ShrImm_S_D_() + private static uint[] _ShrImm_Sri_S_D_() { return new uint[] { + 0x7F404400u, // SRI D0, D0, #64 0x5F402400u, // SRSHR D0, D0, #64 0x5F403400u, // SRSRA D0, D0, #64 0x5F400400u, // SSHR D0, D0, #64 @@ -300,10 +301,11 @@ namespace Ryujinx.Tests.Cpu }; } - private static uint[] _ShrImm_V_8B_16B_() + private static uint[] _ShrImm_Sri_V_8B_16B_() { return new uint[] { + 0x2F084400u, // SRI V0.8B, V0.8B, #8 0x0F082400u, // SRSHR V0.8B, V0.8B, #8 0x0F083400u, // SRSRA V0.8B, V0.8B, #8 0x0F080400u, // SSHR V0.8B, V0.8B, #8 @@ -315,10 +317,11 @@ namespace Ryujinx.Tests.Cpu }; } - private static uint[] _ShrImm_V_4H_8H_() + private static uint[] _ShrImm_Sri_V_4H_8H_() { return new uint[] { + 0x2F104400u, // SRI V0.4H, V0.4H, #16 0x0F102400u, // SRSHR V0.4H, V0.4H, #16 0x0F103400u, // SRSRA V0.4H, V0.4H, #16 0x0F100400u, // SSHR V0.4H, V0.4H, #16 @@ -330,10 +333,11 @@ namespace Ryujinx.Tests.Cpu }; } - private static uint[] _ShrImm_V_2S_4S_() + private static uint[] _ShrImm_Sri_V_2S_4S_() { return new uint[] { + 0x2F204400u, // SRI V0.2S, V0.2S, #32 0x0F202400u, // SRSHR V0.2S, V0.2S, #32 0x0F203400u, // SRSRA V0.2S, V0.2S, #32 0x0F200400u, // SSHR V0.2S, V0.2S, #32 @@ -345,10 +349,11 @@ namespace Ryujinx.Tests.Cpu }; } - private static uint[] _ShrImm_V_2D_() + private static uint[] _ShrImm_Sri_V_2D_() { return new uint[] { + 0x6F404400u, // SRI V0.2D, V0.2D, #64 0x4F402400u, // SRSHR V0.2D, V0.2D, #64 0x4F403400u, // SRSRA V0.2D, V0.2D, #64 0x4F400400u, // SSHR V0.2D, V0.2D, #64 @@ -743,12 +748,12 @@ namespace Ryujinx.Tests.Cpu } [Test, Pairwise] - public void ShrImm_S_D([ValueSource("_ShrImm_S_D_")] uint opcodes, - [Values(0u)] uint rd, - [Values(1u, 0u)] uint rn, - [ValueSource("_1D_")] [Random(RndCnt)] ulong z, - [ValueSource("_1D_")] [Random(RndCnt)] ulong a, - [Values(1u, 64u)] [Random(2u, 63u, RndCntShift)] uint shift) + public void ShrImm_Sri_S_D([ValueSource("_ShrImm_Sri_S_D_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [ValueSource("_1D_")] [Random(RndCnt)] ulong z, + [ValueSource("_1D_")] [Random(RndCnt)] ulong a, + [Values(1u, 64u)] [Random(2u, 63u, RndCntShift)] uint shift) { uint immHb = (128 - shift) & 0x7F; @@ -764,13 +769,13 @@ namespace Ryujinx.Tests.Cpu } [Test, Pairwise] - public void ShrImm_V_8B_16B([ValueSource("_ShrImm_V_8B_16B_")] uint opcodes, - [Values(0u)] uint rd, - [Values(1u, 0u)] uint rn, - [ValueSource("_8B_")] [Random(RndCnt)] ulong z, - [ValueSource("_8B_")] [Random(RndCnt)] ulong a, - [Values(1u, 8u)] [Random(2u, 7u, RndCntShift)] uint shift, - [Values(0b0u, 0b1u)] uint q) // <8B, 16B> + public void ShrImm_Sri_V_8B_16B([ValueSource("_ShrImm_Sri_V_8B_16B_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [ValueSource("_8B_")] [Random(RndCnt)] ulong z, + [ValueSource("_8B_")] [Random(RndCnt)] ulong a, + [Values(1u, 8u)] [Random(2u, 7u, RndCntShift)] uint shift, + [Values(0b0u, 0b1u)] uint q) // <8B, 16B> { uint immHb = (16 - shift) & 0x7F; @@ -787,13 +792,13 @@ namespace Ryujinx.Tests.Cpu } [Test, Pairwise] - public void ShrImm_V_4H_8H([ValueSource("_ShrImm_V_4H_8H_")] uint opcodes, - [Values(0u)] uint rd, - [Values(1u, 0u)] uint rn, - [ValueSource("_4H_")] [Random(RndCnt)] ulong z, - [ValueSource("_4H_")] [Random(RndCnt)] ulong a, - [Values(1u, 16u)] [Random(2u, 15u, RndCntShift)] uint shift, - [Values(0b0u, 0b1u)] uint q) // <4H, 8H> + public void ShrImm_Sri_V_4H_8H([ValueSource("_ShrImm_Sri_V_4H_8H_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [ValueSource("_4H_")] [Random(RndCnt)] ulong z, + [ValueSource("_4H_")] [Random(RndCnt)] ulong a, + [Values(1u, 16u)] [Random(2u, 15u, RndCntShift)] uint shift, + [Values(0b0u, 0b1u)] uint q) // <4H, 8H> { uint immHb = (32 - shift) & 0x7F; @@ -810,13 +815,13 @@ namespace Ryujinx.Tests.Cpu } [Test, Pairwise] - public void ShrImm_V_2S_4S([ValueSource("_ShrImm_V_2S_4S_")] uint opcodes, - [Values(0u)] uint rd, - [Values(1u, 0u)] uint rn, - [ValueSource("_2S_")] [Random(RndCnt)] ulong z, - [ValueSource("_2S_")] [Random(RndCnt)] ulong a, - [Values(1u, 32u)] [Random(2u, 31u, RndCntShift)] uint shift, - [Values(0b0u, 0b1u)] uint q) // <2S, 4S> + public void ShrImm_Sri_V_2S_4S([ValueSource("_ShrImm_Sri_V_2S_4S_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [ValueSource("_2S_")] [Random(RndCnt)] ulong z, + [ValueSource("_2S_")] [Random(RndCnt)] ulong a, + [Values(1u, 32u)] [Random(2u, 31u, RndCntShift)] uint shift, + [Values(0b0u, 0b1u)] uint q) // <2S, 4S> { uint immHb = (64 - shift) & 0x7F; @@ -833,12 +838,12 @@ namespace Ryujinx.Tests.Cpu } [Test, Pairwise] - public void ShrImm_V_2D([ValueSource("_ShrImm_V_2D_")] uint opcodes, - [Values(0u)] uint rd, - [Values(1u, 0u)] uint rn, - [ValueSource("_1D_")] [Random(RndCnt)] ulong z, - [ValueSource("_1D_")] [Random(RndCnt)] ulong a, - [Values(1u, 64u)] [Random(2u, 63u, RndCntShift)] uint shift) + public void ShrImm_Sri_V_2D([ValueSource("_ShrImm_Sri_V_2D_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [ValueSource("_1D_")] [Random(RndCnt)] ulong z, + [ValueSource("_1D_")] [Random(RndCnt)] ulong a, + [Values(1u, 64u)] [Random(2u, 63u, RndCntShift)] uint shift) { uint immHb = (128 - shift) & 0x7F;