From 9bda7b469946dc44e4ee625ea69494da215c57aa Mon Sep 17 00:00:00 2001 From: mageven <62494521+mageven@users.noreply.github.com> Date: Mon, 22 Feb 2021 20:56:13 +0530 Subject: [PATCH] Implement VCNT instruction (#1963) * Implement VCNT based on AArch64 CNT Add tests * Update PTC version * Address LDj's comments * Explicit size in encoding * Tighter tests * Replace SoftFallback with IR helper Co-authored-by: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com> * Reduce one BitwiseAnd from IR fallback Based on popcount64b from https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation * Rename parameter and add assert Co-authored-by: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com> Co-authored-by: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com> --- ARMeilleure/Decoders/OpCodeTable.cs | 1 + .../Instructions/InstEmitSimdArithmetic.cs | 2 +- .../Instructions/InstEmitSimdArithmetic32.cs | 28 ++++++++++++++ .../Instructions/InstEmitSimdHelper.cs | 12 ++++++ ARMeilleure/Instructions/InstName.cs | 1 + ARMeilleure/Instructions/SoftFallback.cs | 8 ---- ARMeilleure/Translation/Delegates.cs | 1 - ARMeilleure/Translation/PTC/Ptc.cs | 2 +- Ryujinx.Tests/Cpu/CpuTestSimd32.cs | 37 +++++++++++++++++++ 9 files changed, 81 insertions(+), 11 deletions(-) diff --git a/ARMeilleure/Decoders/OpCodeTable.cs b/ARMeilleure/Decoders/OpCodeTable.cs index 50fb132a..2eabcf15 100644 --- a/ARMeilleure/Decoders/OpCodeTable.cs +++ b/ARMeilleure/Decoders/OpCodeTable.cs @@ -814,6 +814,7 @@ namespace ARMeilleure.Decoders SetA32("111100111x11xx01xxxx0x100xx0xxxx", InstName.Vclt, InstEmit32.Vclt_Z, OpCode32SimdCmpZ.Create); SetA32("<<<<11101x11010xxxxx101x01x0xxxx", InstName.Vcmp, InstEmit32.Vcmp, OpCode32SimdS.Create); SetA32("<<<<11101x11010xxxxx101x11x0xxxx", InstName.Vcmpe, InstEmit32.Vcmpe, OpCode32SimdS.Create); + SetA32("111100111x110000xxxx01010xx0xxxx", InstName.Vcnt, InstEmit32.Vcnt, OpCode32SimdCmpZ.Create); SetA32("<<<<11101x110111xxxx101x11x0xxxx", InstName.Vcvt, InstEmit32.Vcvt_FD, OpCode32SimdS.Create); // FP 32 and 64, scalar. SetA32("<<<<11101x11110xxxxx101x11x0xxxx", InstName.Vcvt, InstEmit32.Vcvt_FI, OpCode32SimdCvtFI.Create); // FP32 to int. SetA32("<<<<11101x111000xxxx101xx1x0xxxx", InstName.Vcvt, InstEmit32.Vcvt_FI, OpCode32SimdCvtFI.Create); // Int to FP32. diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs index deaa6f5a..9c359882 100644 --- a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs +++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs @@ -289,7 +289,7 @@ namespace ARMeilleure.Instructions } else { - de = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountSetBits8)), ne); + de = EmitCountSetBits8(context, ne); } res = EmitVectorInsert(context, res, de, index, 0); diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs index 0fc8c391..0d26a90f 100644 --- a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs +++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs @@ -135,6 +135,34 @@ namespace ARMeilleure.Instructions EmitVectorBinaryWideOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U); } + public static void Vcnt(ArmEmitterContext context) + { + OpCode32SimdCmpZ op = (OpCode32SimdCmpZ)context.CurrOp; + + Operand res = GetVecA32(op.Qd); + + int elems = op.GetBytesCount(); + + for (int index = 0; index < elems; index++) + { + Operand de; + Operand me = EmitVectorExtractZx32(context, op.Qm, op.Im + index, op.Size); + + if (Optimizations.UsePopCnt) + { + de = context.AddIntrinsicInt(Intrinsic.X86Popcnt, me); + } + else + { + de = EmitCountSetBits8(context, me); + } + + res = EmitVectorInsert(context, res, de, op.Id + index, op.Size); + } + + context.Copy(GetVecA32(op.Qd), res); + } + public static void Vdup(ArmEmitterContext context) { OpCode32SimdDupGP op = (OpCode32SimdDupGP)context.CurrOp; diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper.cs b/ARMeilleure/Instructions/InstEmitSimdHelper.cs index da8ccae7..d362ad5e 100644 --- a/ARMeilleure/Instructions/InstEmitSimdHelper.cs +++ b/ARMeilleure/Instructions/InstEmitSimdHelper.cs @@ -234,6 +234,18 @@ namespace ARMeilleure.Instructions throw new ArgumentException($"Invalid rounding mode \"{roundMode}\"."); } + public static Operand EmitCountSetBits8(ArmEmitterContext context, Operand op) // "size" is 8 (SIMD&FP Inst.). + { + Debug.Assert(op.Type == OperandType.I32 || op.Type == OperandType.I64); + + Operand op0 = context.Subtract(op, context.BitwiseAnd(context.ShiftRightUI(op, Const(1)), Const(op.Type, 0x55L))); + + Operand c1 = Const(op.Type, 0x33L); + Operand op1 = context.Add(context.BitwiseAnd(context.ShiftRightUI(op0, Const(2)), c1), context.BitwiseAnd(op0, c1)); + + return context.BitwiseAnd(context.Add(op1, context.ShiftRightUI(op1, Const(4))), Const(op.Type, 0x0fL)); + } + public static void EmitScalarUnaryOpF(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64) { OpCodeSimd op = (OpCodeSimd)context.CurrOp; diff --git a/ARMeilleure/Instructions/InstName.cs b/ARMeilleure/Instructions/InstName.cs index ca2a63d9..458ecf2f 100644 --- a/ARMeilleure/Instructions/InstName.cs +++ b/ARMeilleure/Instructions/InstName.cs @@ -567,6 +567,7 @@ namespace ARMeilleure.Instructions Vclt, Vcmp, Vcmpe, + Vcnt, Vcvt, Vdiv, Vdup, diff --git a/ARMeilleure/Instructions/SoftFallback.cs b/ARMeilleure/Instructions/SoftFallback.cs index 1d8fa2e2..3ff6483d 100644 --- a/ARMeilleure/Instructions/SoftFallback.cs +++ b/ARMeilleure/Instructions/SoftFallback.cs @@ -846,14 +846,6 @@ namespace ARMeilleure.Instructions return (ulong)count; } - - public static ulong CountSetBits8(ulong value) // "size" is 8 (SIMD&FP Inst.). - { - value = ((value >> 1) & 0x55ul) + (value & 0x55ul); - value = ((value >> 2) & 0x33ul) + (value & 0x33ul); - - return (value >> 4) + (value & 0x0ful); - } #endregion #region "Table" diff --git a/ARMeilleure/Translation/Delegates.cs b/ARMeilleure/Translation/Delegates.cs index 1097ea58..5ad71872 100644 --- a/ARMeilleure/Translation/Delegates.cs +++ b/ARMeilleure/Translation/Delegates.cs @@ -148,7 +148,6 @@ namespace ARMeilleure.Translation SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.BinaryUnsignedSatQSub))); SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingSigns))); SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingZeros))); - SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountSetBits8))); SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Crc32b))); SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Crc32cb))); SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Crc32ch))); diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs index 163e3efa..01d0b236 100644 --- a/ARMeilleure/Translation/PTC/Ptc.cs +++ b/ARMeilleure/Translation/PTC/Ptc.cs @@ -26,7 +26,7 @@ namespace ARMeilleure.Translation.PTC { private const string HeaderMagicString = "PTChd\0\0\0"; - private const uint InternalVersion = 1968; //! To be incremented manually for each change to the ARMeilleure project. + private const int InternalVersion = 1963; //! To be incremented manually for each change to the ARMeilleure project. private const string ActualDir = "0"; private const string BackupDir = "1"; diff --git a/Ryujinx.Tests/Cpu/CpuTestSimd32.cs b/Ryujinx.Tests/Cpu/CpuTestSimd32.cs index 45c42ddf..4b068dda 100644 --- a/Ryujinx.Tests/Cpu/CpuTestSimd32.cs +++ b/Ryujinx.Tests/Cpu/CpuTestSimd32.cs @@ -154,6 +154,15 @@ namespace Ryujinx.Tests.Cpu yield return rnd2; } } + + private static IEnumerable _GenPopCnt8B_() + { + for (ulong cnt = 0ul; cnt <= 255ul; cnt++) + { + yield return (cnt << 56) | (cnt << 48) | (cnt << 40) | (cnt << 32) | + (cnt << 24) | (cnt << 16) | (cnt << 08) | cnt; + } + } #endregion private const int RndCnt = 2; @@ -217,6 +226,34 @@ namespace Ryujinx.Tests.Cpu CompareAgainstUnicorn(); } + + [Test, Pairwise, Description("VCNT.8 D0, D0 | VCNT.8 Q0, Q0")] + public void Vcnt([Values(0u, 1u)] uint rd, + [Values(0u, 1u)] uint rm, + [ValueSource(nameof(_GenPopCnt8B_))] [Random(RndCnt)] ulong d0, + [Values] bool q) + { + ulong d1 = ~d0; // It's expensive to have a second generator. + + uint opcode = 0xf3b00500u; // VCNT.8 D0, D0 + + if (q) + { + opcode |= 1u << 6; + + rd &= ~1u; + rm &= ~1u; + } + + opcode |= ((rd & 0xf) << 12) | ((rd & 0x10) << 18); + opcode |= ((rm & 0xf) << 0) | ((rm & 0x10) << 1); + + V128 v0 = MakeVectorE0E1(d0, d1); + + SingleOpcode(opcode, v0: v0); + + CompareAgainstUnicorn(); + } #endif } }