CPU: Implement VFMA (Vector) (#1762)
* Implement VFMA.F64 * Simplify switch * Simplify FMA Instructions into their own IntrinsicType. * Remove whitespace * Fix indentation * Change tests for Vfnms -- disable inf / nan * Move args up, not description ;) * Implementation Complete. All Tests Pass (Slow / Fast Path) * Move location of function in assembler + test updates. * Shift params upwards * Remove unused function * Update PTC version. * Add comments / re-order opcode table. * Remove whitespace * Fix nit * Fix nit. * Fix whitespace * Wrong opcode was used by a bad merge. * Addressed rip's comments.
This commit is contained in:
parent
47ba81c661
commit
3332b29f01
5 changed files with 80 additions and 4 deletions
|
@ -273,10 +273,10 @@ namespace ARMeilleure.CodeGen.X86
|
||||||
Add(X86Instruction.Vblendvps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4a, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
Add(X86Instruction.Vblendvps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4a, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||||
Add(X86Instruction.Vcvtph2ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3813, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
Add(X86Instruction.Vcvtph2ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3813, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||||
Add(X86Instruction.Vcvtps2ph, new InstructionInfo(0x000f3a1d, BadOp, BadOp, BadOp, BadOp, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
Add(X86Instruction.Vcvtps2ph, new InstructionInfo(0x000f3a1d, BadOp, BadOp, BadOp, BadOp, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||||
Add(X86Instruction.Vfmadd231ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bc, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
Add(X86Instruction.Vfmadd231ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b8, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||||
Add(X86Instruction.Vfmadd231pd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bc, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
|
Add(X86Instruction.Vfmadd231pd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b8, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
|
||||||
Add(X86Instruction.Vfmadd231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bd, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
Add(X86Instruction.Vfmadd231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b9, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||||
Add(X86Instruction.Vfmadd231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bd, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
|
Add(X86Instruction.Vfmadd231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b9, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
|
||||||
Add(X86Instruction.Vfmsub231ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38ba, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
Add(X86Instruction.Vfmsub231ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38ba, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||||
Add(X86Instruction.Vfmsub231pd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38ba, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
|
Add(X86Instruction.Vfmsub231pd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38ba, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
|
||||||
Add(X86Instruction.Vfmsub231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
Add(X86Instruction.Vfmsub231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||||
|
|
|
@ -820,6 +820,7 @@ namespace ARMeilleure.Decoders
|
||||||
SetA32("111100110x00xxxxxxxx0001xxx1xxxx", InstName.Veor, InstEmit32.Veor_I, OpCode32SimdBinary.Create);
|
SetA32("111100110x00xxxxxxxx0001xxx1xxxx", InstName.Veor, InstEmit32.Veor_I, OpCode32SimdBinary.Create);
|
||||||
SetA32("111100101x11xxxxxxxxxxxxxxx0xxxx", InstName.Vext, InstEmit32.Vext, OpCode32SimdExt.Create);
|
SetA32("111100101x11xxxxxxxxxxxxxxx0xxxx", InstName.Vext, InstEmit32.Vext, OpCode32SimdExt.Create);
|
||||||
SetA32("<<<<11101x10xxxxxxxx101xx0x0xxxx", InstName.Vfma, InstEmit32.Vfma_S, OpCode32SimdRegS.Create);
|
SetA32("<<<<11101x10xxxxxxxx101xx0x0xxxx", InstName.Vfma, InstEmit32.Vfma_S, OpCode32SimdRegS.Create);
|
||||||
|
SetA32("111100100x00xxxxxxxx1100xxx1xxxx", InstName.Vfma, InstEmit32.Vfma_V, OpCode32SimdReg.Create);
|
||||||
SetA32("<<<<11101x10xxxxxxxx101xx1x0xxxx", InstName.Vfms, InstEmit32.Vfms_S, OpCode32SimdRegS.Create);
|
SetA32("<<<<11101x10xxxxxxxx101xx1x0xxxx", InstName.Vfms, InstEmit32.Vfms_S, OpCode32SimdRegS.Create);
|
||||||
SetA32("<<<<11101x01xxxxxxxx101xx1x0xxxx", InstName.Vfnma, InstEmit32.Vfnma_S, OpCode32SimdRegS.Create);
|
SetA32("<<<<11101x01xxxxxxxx101xx1x0xxxx", InstName.Vfnma, InstEmit32.Vfnma_S, OpCode32SimdRegS.Create);
|
||||||
SetA32("<<<<11101x01xxxxxxxx101xx0x0xxxx", InstName.Vfnms, InstEmit32.Vfnms_S, OpCode32SimdRegS.Create);
|
SetA32("<<<<11101x01xxxxxxxx101xx0x0xxxx", InstName.Vfnms, InstEmit32.Vfnms_S, OpCode32SimdRegS.Create);
|
||||||
|
|
|
@ -252,6 +252,23 @@ namespace ARMeilleure.Instructions
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emits the vector form of VFMA (fused multiply-add).
// Uses the x86 FMA intrinsics when both FastFP and UseFma are enabled, and falls back
// to a soft-float call otherwise.
public static void Vfma_V(ArmEmitterContext context) // Fused.
{
    bool useIntrinsics = Optimizations.FastFP && Optimizations.UseFma;

    if (!useIntrinsics)
    {
        // Slow path: route each ternary operation through SoftFloat32.FPMulAdd.
        EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
            EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), op1, op2, op3));

        return;
    }

    // Vector elements are always 32 bits long; only the number of elements in a vector changes.
    // The 64-bit variant will therefore never be used.
    EmitVectorTernaryOpF32(context, Intrinsic.X86Vfmadd231ps, Intrinsic.X86Vfmadd231pd);
}
|
||||||
|
|
||||||
public static void Vfma_S(ArmEmitterContext context) // Fused.
|
public static void Vfma_S(ArmEmitterContext context) // Fused.
|
||||||
{
|
{
|
||||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||||
|
|
|
@ -820,6 +820,18 @@ namespace ARMeilleure.Instructions
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emits a three-operand vector operation through an x86 intrinsic.
// inst64 is chosen when bit 0 of the current opcode's Size field is set, inst32 otherwise;
// the chosen intrinsic is then applied to (d, n, m) by EmitVectorTernaryOpSimd32.
public static void EmitVectorTernaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
{
    OpCode32SimdReg opCode = (OpCode32SimdReg)context.CurrOp;

    bool sizeBitSet = (opCode.Size & 1) != 0;

    Intrinsic selected;

    if (sizeBitSet)
    {
        selected = inst64;
    }
    else
    {
        selected = inst32;
    }

    EmitVectorTernaryOpSimd32(context, (d, n, m) => context.AddIntrinsic(selected, d, n, m));
}
|
||||||
|
|
||||||
public static void EmitScalarUnaryOpSimd32(ArmEmitterContext context, Func1I scalarFunc)
|
public static void EmitScalarUnaryOpSimd32(ArmEmitterContext context, Func1I scalarFunc)
|
||||||
{
|
{
|
||||||
OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;
|
OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;
|
||||||
|
|
|
@ -293,6 +293,52 @@ namespace Ryujinx.Tests.Cpu
|
||||||
CompareAgainstUnicorn(fpsrMask: Fpsr.Nzcv);
|
CompareAgainstUnicorn(fpsrMask: Fpsr.Nzcv);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Runs a single VFMA (vector) opcode and compares the resulting state against Unicorn.
// Q == 0 uses two-element operand vectors, Q == 1 uses four-element operand vectors.
[Test, Pairwise, Description("VFMA.F<size> <Vd>, <Vn>, <Vm>")]
public void Vfma([Values(0u, 1u)] uint rd,
                 [Values(0u, 1u)] uint rn,
                 [Values(0u, 1u)] uint rm,
                 [Values(0u, 1u)] uint Q,
                 [ValueSource("_2S_F_")] ulong z,
                 [ValueSource("_2S_F_")] ulong a,
                 [ValueSource("_2S_F_")] ulong b)
{
    uint opcode = 0xf2000c10;

    V128 v0;
    V128 v1;
    V128 v2;

    // NOTE(review): SingleToInt32Bits takes a float, so each ulong is first converted
    // numerically to float; c/d/e are the bits of that value, not the low 32 bits of the
    // source pattern. Confirm this is the intended way to derive the element values.
    uint c = (uint)BitConverter.SingleToInt32Bits(z);
    uint d = (uint)BitConverter.SingleToInt32Bits(a);
    uint e = (uint)BitConverter.SingleToInt32Bits(b);

    if (Q == 0)
    {
        // Bit 0 of each index goes to the M/D/N bit (5/22/7); the remaining bits go to the
        // 4-bit Vm/Vd/Vn fields (bits 0-3 / 12-15 / 16-19).
        // NOTE(review): this is the layout normally used for single-precision register
        // indices; confirm it is what is wanted for the 64-bit vector form.
        opcode |= (((rm & 0x1) << 5) | (rm & 0x1e) >> 1);
        opcode |= (((rd & 0x1) << 22) | (rd & 0x1e) << 11);
        // Upper bits of rn must be shifted LEFT into bits 16-19 (the original shifted right,
        // which always dropped them).
        opcode |= (((rn & 0x1) << 7) | (rn & 0x1e) << 15);

        v0 = MakeVectorE0E1(c, c);
        v1 = MakeVectorE0E1(d, c);
        v2 = MakeVectorE0E1(e, c);
    }
    else
    {
        rd = rn = rm = 0; // Needed, as these values cannot be odd values if Q == 1.

        // With all indices forced to zero these contribute no bits; kept so the encoding
        // stays visible if the indices are ever varied.
        opcode |= (((rm & 0x10) << 1) | (rm & 0xf) << 0);
        opcode |= (((rd & 0x10) << 18) | (rd & 0xf) << 12);
        opcode |= (((rn & 0x10) << 3) | (rn & 0xf) << 16);

        v0 = MakeVectorE0E1E2E3(c, c, d, e);
        v1 = MakeVectorE0E1E2E3(d, c, e, c);
        v2 = MakeVectorE0E1E2E3(e, c, d, c);
    }

    // Q selects the quadword form (bit 6).
    opcode |= ((Q & 1) << 6);

    SingleOpcode(opcode, v0: v0, v1: v1, v2: v2);

    CompareAgainstUnicorn();
}
|
||||||
|
|
||||||
[Test, Pairwise, Description("VFNMA.F<size> <Vd>, <Vn>, <Vm>")]
|
[Test, Pairwise, Description("VFNMA.F<size> <Vd>, <Vn>, <Vm>")]
|
||||||
public void Vfnma([Values(0u, 1u)] uint rd,
|
public void Vfnma([Values(0u, 1u)] uint rd,
|
||||||
[Values(0u, 1u)] uint rn,
|
[Values(0u, 1u)] uint rn,
|
||||||
|
|
Reference in a new issue