Implement VCNT instruction (#1963)
* Implement VCNT based on AArch64 CNT
  Add tests
* Update PTC version
* Address LDj's comments
* Explicit size in encoding
* Tighter tests
* Replace SoftFallback with IR helper
  Co-authored-by: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>
* Reduce one BitwiseAnd from IR fallback
  Based on popcount64b from https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
* Rename parameter and add assert
  Co-authored-by: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>

Co-authored-by: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>
Co-authored-by: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>
This commit is contained in:
parent
dc0adb533d
commit
9bda7b4699
9 changed files with 81 additions and 11 deletions
|
@ -814,6 +814,7 @@ namespace ARMeilleure.Decoders
|
|||
SetA32("111100111x11xx01xxxx0x100xx0xxxx", InstName.Vclt, InstEmit32.Vclt_Z, OpCode32SimdCmpZ.Create);
|
||||
SetA32("<<<<11101x11010xxxxx101x01x0xxxx", InstName.Vcmp, InstEmit32.Vcmp, OpCode32SimdS.Create);
|
||||
SetA32("<<<<11101x11010xxxxx101x11x0xxxx", InstName.Vcmpe, InstEmit32.Vcmpe, OpCode32SimdS.Create);
|
||||
SetA32("111100111x110000xxxx01010xx0xxxx", InstName.Vcnt, InstEmit32.Vcnt, OpCode32SimdCmpZ.Create);
|
||||
SetA32("<<<<11101x110111xxxx101x11x0xxxx", InstName.Vcvt, InstEmit32.Vcvt_FD, OpCode32SimdS.Create); // FP 32 and 64, scalar.
|
||||
SetA32("<<<<11101x11110xxxxx101x11x0xxxx", InstName.Vcvt, InstEmit32.Vcvt_FI, OpCode32SimdCvtFI.Create); // FP32 to int.
|
||||
SetA32("<<<<11101x111000xxxx101xx1x0xxxx", InstName.Vcvt, InstEmit32.Vcvt_FI, OpCode32SimdCvtFI.Create); // Int to FP32.
|
||||
|
|
|
@ -289,7 +289,7 @@ namespace ARMeilleure.Instructions
|
|||
}
|
||||
else
|
||||
{
|
||||
de = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountSetBits8)), ne);
|
||||
de = EmitCountSetBits8(context, ne);
|
||||
}
|
||||
|
||||
res = EmitVectorInsert(context, res, de, index, 0);
|
||||
|
|
|
@ -135,6 +135,34 @@ namespace ARMeilleure.Instructions
|
|||
EmitVectorBinaryWideOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U);
|
||||
}
|
||||
|
||||
// Emits the IR for the A32 VCNT instruction: for each of op.GetBytesCount() elements, the
// element is read zero-extended from Qm, its set bits are counted, and the count is inserted
// into the result vector that is finally copied to Qd.
public static void Vcnt(ArmEmitterContext context)
{
    OpCode32SimdCmpZ op = (OpCode32SimdCmpZ)context.CurrOp;

    // Seed the result with the current value of the destination register; elements are
    // then inserted into it one at a time.
    Operand result = GetVecA32(op.Qd);

    int count = op.GetBytesCount();

    for (int i = 0; i < count; i++)
    {
        Operand source = EmitVectorExtractZx32(context, op.Qm, op.Im + i, op.Size);

        // Use the X86 POPCNT intrinsic when the optimization is enabled, otherwise fall
        // back to the inline IR bit-counting helper.
        Operand bits = Optimizations.UsePopCnt
            ? context.AddIntrinsicInt(Intrinsic.X86Popcnt, source)
            : EmitCountSetBits8(context, source);

        result = EmitVectorInsert(context, result, bits, op.Id + i, op.Size);
    }

    context.Copy(GetVecA32(op.Qd), result);
}
|
||||
|
||||
public static void Vdup(ArmEmitterContext context)
|
||||
{
|
||||
OpCode32SimdDupGP op = (OpCode32SimdDupGP)context.CurrOp;
|
||||
|
|
|
@ -234,6 +234,18 @@ namespace ARMeilleure.Instructions
|
|||
throw new ArgumentException($"Invalid rounding mode \"{roundMode}\".");
|
||||
}
|
||||
|
||||
// Emits IR that counts the set bits of an 8-bit value using only shifts, masks, a subtract
// and adds. "op" must be an I32 or I64 operand (asserted below); the masks are created with
// the same operand type as "op".
public static Operand EmitCountSetBits8(ArmEmitterContext context, Operand op) // "size" is 8 (SIMD&FP Inst.).
{
    Debug.Assert(op.Type == OperandType.I32 || op.Type == OperandType.I64);

    // Step 1: per-pair counts, computed as x - ((x >> 1) & 0x55).
    Operand oddBits = context.BitwiseAnd(context.ShiftRightUI(op, Const(1)), Const(op.Type, 0x55L));
    Operand pairCounts = context.Subtract(op, oddBits);

    // Step 2: per-nibble counts, adding adjacent 2-bit fields.
    Operand pairMask = Const(op.Type, 0x33L);

    Operand highPairs = context.BitwiseAnd(context.ShiftRightUI(pairCounts, Const(2)), pairMask);
    Operand lowPairs = context.BitwiseAnd(pairCounts, pairMask);
    Operand nibbleCounts = context.Add(highPairs, lowPairs);

    // Step 3: add the two nibble counts and keep only the low four bits of the sum.
    Operand folded = context.Add(nibbleCounts, context.ShiftRightUI(nibbleCounts, Const(4)));

    return context.BitwiseAnd(folded, Const(op.Type, 0x0fL));
}
|
||||
|
||||
public static void EmitScalarUnaryOpF(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
|
||||
{
|
||||
OpCodeSimd op = (OpCodeSimd)context.CurrOp;
|
||||
|
|
|
@ -567,6 +567,7 @@ namespace ARMeilleure.Instructions
|
|||
Vclt,
|
||||
Vcmp,
|
||||
Vcmpe,
|
||||
Vcnt,
|
||||
Vcvt,
|
||||
Vdiv,
|
||||
Vdup,
|
||||
|
|
|
@ -846,14 +846,6 @@ namespace ARMeilleure.Instructions
|
|||
|
||||
return (ulong)count;
|
||||
}
|
||||
|
||||
// Returns the number of bits set in the low 8 bits of "value". Bits above bit 7 never
// contribute, because the first step masks both terms with 0x55.
public static ulong CountSetBits8(ulong value) // "size" is 8 (SIMD&FP Inst.).
{
    const ulong PairMask = 0x55ul;
    const ulong NibbleMask = 0x33ul;
    const ulong LowNibble = 0x0ful;

    // Sum adjacent bits into 2-bit fields.
    ulong pairCounts = (value & PairMask) + ((value >> 1) & PairMask);

    // Sum adjacent 2-bit fields into 4-bit fields.
    ulong nibbleCounts = (pairCounts & NibbleMask) + ((pairCounts >> 2) & NibbleMask);

    // Add the two 4-bit fields together.
    return (nibbleCounts & LowNibble) + (nibbleCounts >> 4);
}
|
||||
#endregion
|
||||
|
||||
#region "Table"
|
||||
|
|
|
@ -148,7 +148,6 @@ namespace ARMeilleure.Translation
|
|||
SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.BinaryUnsignedSatQSub)));
|
||||
SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingSigns)));
|
||||
SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingZeros)));
|
||||
SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountSetBits8)));
|
||||
SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Crc32b)));
|
||||
SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Crc32cb)));
|
||||
SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Crc32ch)));
|
||||
|
|
|
@ -26,7 +26,7 @@ namespace ARMeilleure.Translation.PTC
|
|||
{
|
||||
private const string HeaderMagicString = "PTChd\0\0\0";
|
||||
|
||||
private const uint InternalVersion = 1968; //! To be incremented manually for each change to the ARMeilleure project.
|
||||
private const int InternalVersion = 1963; //! To be incremented manually for each change to the ARMeilleure project.
|
||||
|
||||
private const string ActualDir = "0";
|
||||
private const string BackupDir = "1";
|
||||
|
|
|
@ -154,6 +154,15 @@ namespace Ryujinx.Tests.Cpu
|
|||
yield return rnd2;
|
||||
}
|
||||
}
|
||||
|
||||
// Yields 256 values, one per byte value 0x00..0xFF in ascending order, each with that byte
// replicated into all eight byte positions of the 64-bit result.
private static IEnumerable<ulong> _GenPopCnt8B_()
{
    // Multiplying a value below 256 by this constant copies it into every byte position.
    const ulong ByteReplicator = 0x0101010101010101ul;

    for (ulong byteValue = 0ul; byteValue < 256ul; byteValue++)
    {
        yield return byteValue * ByteReplicator;
    }
}
|
||||
#endregion
|
||||
|
||||
private const int RndCnt = 2;
|
||||
|
@ -217,6 +226,34 @@ namespace Ryujinx.Tests.Cpu
|
|||
|
||||
CompareAgainstUnicorn();
|
||||
}
|
||||
|
||||
// Builds a VCNT.8 opcode (D or Q form, selected by "q") for the given destination/source
// register numbers, runs it with V0 loaded from d0 and its complement, and compares the
// outcome against Unicorn.
[Test, Pairwise, Description("VCNT.8 D0, D0 | VCNT.8 Q0, Q0")]
public void Vcnt([Values(0u, 1u)] uint rd,
                 [Values(0u, 1u)] uint rm,
                 [ValueSource(nameof(_GenPopCnt8B_))] [Random(RndCnt)] ulong d0,
                 [Values] bool q)
{
    const uint BaseOpcode = 0xf3b00500u; // VCNT.8 D0, D0
    const uint QFlag = 1u << 6;

    uint opcode = BaseOpcode;

    if (q)
    {
        opcode |= QFlag;

        // The Q form is only exercised with even register numbers.
        rd &= ~1u;
        rm &= ~1u;
    }

    // Low four bits of each register number go in the main field, bit 4 in a separate bit.
    uint destBits = ((rd & 0xf) << 12) | ((rd & 0x10) << 18);
    uint srcBits = ((rm & 0xf) << 0) | ((rm & 0x10) << 1);

    opcode |= destBits;
    opcode |= srcBits;

    // It's expensive to have a second generator, so the second element is derived from d0.
    ulong d1 = ~d0;

    V128 v0 = MakeVectorE0E1(d0, d1);

    SingleOpcode(opcode, v0: v0);

    CompareAgainstUnicorn();
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
|
Reference in a new issue