CPU (A64): Add Fmaxnmp & Fminnmp scalar instructions, with fast & slow paths and tests. (#1894)
This commit is contained in:
parent
b8353f5639
commit
c3e0c41da3
7 changed files with 111 additions and 25 deletions
|
@ -119,6 +119,7 @@ namespace ARMeilleure.CodeGen.X86
|
||||||
Add(Intrinsic.X86Popcnt, new IntrinsicInfo(X86Instruction.Popcnt, IntrinsicType.PopCount));
|
Add(Intrinsic.X86Popcnt, new IntrinsicInfo(X86Instruction.Popcnt, IntrinsicType.PopCount));
|
||||||
Add(Intrinsic.X86Por, new IntrinsicInfo(X86Instruction.Por, IntrinsicType.Binary));
|
Add(Intrinsic.X86Por, new IntrinsicInfo(X86Instruction.Por, IntrinsicType.Binary));
|
||||||
Add(Intrinsic.X86Pshufb, new IntrinsicInfo(X86Instruction.Pshufb, IntrinsicType.Binary));
|
Add(Intrinsic.X86Pshufb, new IntrinsicInfo(X86Instruction.Pshufb, IntrinsicType.Binary));
|
||||||
|
Add(Intrinsic.X86Pshufd, new IntrinsicInfo(X86Instruction.Pshufd, IntrinsicType.BinaryImm));
|
||||||
Add(Intrinsic.X86Pslld, new IntrinsicInfo(X86Instruction.Pslld, IntrinsicType.Binary));
|
Add(Intrinsic.X86Pslld, new IntrinsicInfo(X86Instruction.Pslld, IntrinsicType.Binary));
|
||||||
Add(Intrinsic.X86Pslldq, new IntrinsicInfo(X86Instruction.Pslldq, IntrinsicType.Binary));
|
Add(Intrinsic.X86Pslldq, new IntrinsicInfo(X86Instruction.Pslldq, IntrinsicType.Binary));
|
||||||
Add(Intrinsic.X86Psllq, new IntrinsicInfo(X86Instruction.Psllq, IntrinsicType.Binary));
|
Add(Intrinsic.X86Psllq, new IntrinsicInfo(X86Instruction.Psllq, IntrinsicType.Binary));
|
||||||
|
|
|
@ -311,6 +311,7 @@ namespace ARMeilleure.Decoders
|
||||||
SetA64("0>0011100<1xxxxx111101xxxxxxxxxx", InstName.Fmax_V, InstEmit.Fmax_V, OpCodeSimdReg.Create);
|
SetA64("0>0011100<1xxxxx111101xxxxxxxxxx", InstName.Fmax_V, InstEmit.Fmax_V, OpCodeSimdReg.Create);
|
||||||
SetA64("000111100x1xxxxx011010xxxxxxxxxx", InstName.Fmaxnm_S, InstEmit.Fmaxnm_S, OpCodeSimdReg.Create);
|
SetA64("000111100x1xxxxx011010xxxxxxxxxx", InstName.Fmaxnm_S, InstEmit.Fmaxnm_S, OpCodeSimdReg.Create);
|
||||||
SetA64("0>0011100<1xxxxx110001xxxxxxxxxx", InstName.Fmaxnm_V, InstEmit.Fmaxnm_V, OpCodeSimdReg.Create);
|
SetA64("0>0011100<1xxxxx110001xxxxxxxxxx", InstName.Fmaxnm_V, InstEmit.Fmaxnm_V, OpCodeSimdReg.Create);
|
||||||
|
SetA64("011111100x110000110010xxxxxxxxxx", InstName.Fmaxnmp_S, InstEmit.Fmaxnmp_S, OpCodeSimd.Create);
|
||||||
SetA64("0>1011100<1xxxxx110001xxxxxxxxxx", InstName.Fmaxnmp_V, InstEmit.Fmaxnmp_V, OpCodeSimdReg.Create);
|
SetA64("0>1011100<1xxxxx110001xxxxxxxxxx", InstName.Fmaxnmp_V, InstEmit.Fmaxnmp_V, OpCodeSimdReg.Create);
|
||||||
SetA64("0110111000110000110010xxxxxxxxxx", InstName.Fmaxnmv_V, InstEmit.Fmaxnmv_V, OpCodeSimd.Create);
|
SetA64("0110111000110000110010xxxxxxxxxx", InstName.Fmaxnmv_V, InstEmit.Fmaxnmv_V, OpCodeSimd.Create);
|
||||||
SetA64("0>1011100<1xxxxx111101xxxxxxxxxx", InstName.Fmaxp_V, InstEmit.Fmaxp_V, OpCodeSimdReg.Create);
|
SetA64("0>1011100<1xxxxx111101xxxxxxxxxx", InstName.Fmaxp_V, InstEmit.Fmaxp_V, OpCodeSimdReg.Create);
|
||||||
|
@ -319,6 +320,7 @@ namespace ARMeilleure.Decoders
|
||||||
SetA64("0>0011101<1xxxxx111101xxxxxxxxxx", InstName.Fmin_V, InstEmit.Fmin_V, OpCodeSimdReg.Create);
|
SetA64("0>0011101<1xxxxx111101xxxxxxxxxx", InstName.Fmin_V, InstEmit.Fmin_V, OpCodeSimdReg.Create);
|
||||||
SetA64("000111100x1xxxxx011110xxxxxxxxxx", InstName.Fminnm_S, InstEmit.Fminnm_S, OpCodeSimdReg.Create);
|
SetA64("000111100x1xxxxx011110xxxxxxxxxx", InstName.Fminnm_S, InstEmit.Fminnm_S, OpCodeSimdReg.Create);
|
||||||
SetA64("0>0011101<1xxxxx110001xxxxxxxxxx", InstName.Fminnm_V, InstEmit.Fminnm_V, OpCodeSimdReg.Create);
|
SetA64("0>0011101<1xxxxx110001xxxxxxxxxx", InstName.Fminnm_V, InstEmit.Fminnm_V, OpCodeSimdReg.Create);
|
||||||
|
SetA64("011111101x110000110010xxxxxxxxxx", InstName.Fminnmp_S, InstEmit.Fminnmp_S, OpCodeSimd.Create);
|
||||||
SetA64("0>1011101<1xxxxx110001xxxxxxxxxx", InstName.Fminnmp_V, InstEmit.Fminnmp_V, OpCodeSimdReg.Create);
|
SetA64("0>1011101<1xxxxx110001xxxxxxxxxx", InstName.Fminnmp_V, InstEmit.Fminnmp_V, OpCodeSimdReg.Create);
|
||||||
SetA64("0110111010110000110010xxxxxxxxxx", InstName.Fminnmv_V, InstEmit.Fminnmv_V, OpCodeSimd.Create);
|
SetA64("0110111010110000110010xxxxxxxxxx", InstName.Fminnmv_V, InstEmit.Fminnmv_V, OpCodeSimd.Create);
|
||||||
SetA64("0>1011101<1xxxxx111101xxxxxxxxxx", InstName.Fminp_V, InstEmit.Fminp_V, OpCodeSimdReg.Create);
|
SetA64("0>1011101<1xxxxx111101xxxxxxxxxx", InstName.Fminp_V, InstEmit.Fminp_V, OpCodeSimdReg.Create);
|
||||||
|
|
|
@ -347,19 +347,17 @@ namespace ARMeilleure.Instructions
|
||||||
|
|
||||||
public static void Faddp_S(ArmEmitterContext context)
|
public static void Faddp_S(ArmEmitterContext context)
|
||||||
{
|
{
|
||||||
OpCodeSimd op = (OpCodeSimd)context.CurrOp;
|
|
||||||
|
|
||||||
int sizeF = op.Size & 1;
|
|
||||||
|
|
||||||
if (Optimizations.FastFP && Optimizations.UseSse3)
|
if (Optimizations.FastFP && Optimizations.UseSse3)
|
||||||
{
|
{
|
||||||
if (sizeF == 0)
|
OpCodeSimd op = (OpCodeSimd)context.CurrOp;
|
||||||
|
|
||||||
|
if ((op.Size & 1) == 0)
|
||||||
{
|
{
|
||||||
Operand res = context.AddIntrinsic(Intrinsic.X86Haddps, GetVec(op.Rn), GetVec(op.Rn));
|
Operand res = context.AddIntrinsic(Intrinsic.X86Haddps, GetVec(op.Rn), GetVec(op.Rn));
|
||||||
|
|
||||||
context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
|
context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
|
||||||
}
|
}
|
||||||
else /* if (sizeF == 1) */
|
else /* if ((op.Size & 1) == 1) */
|
||||||
{
|
{
|
||||||
Operand res = context.AddIntrinsic(Intrinsic.X86Haddpd, GetVec(op.Rn), GetVec(op.Rn));
|
Operand res = context.AddIntrinsic(Intrinsic.X86Haddpd, GetVec(op.Rn), GetVec(op.Rn));
|
||||||
|
|
||||||
|
@ -368,14 +366,10 @@ namespace ARMeilleure.Instructions
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;
|
EmitScalarPairwiseOpF(context, (op1, op2) =>
|
||||||
|
{
|
||||||
Operand ne0 = context.VectorExtract(type, GetVec(op.Rn), 0);
|
return EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), op1, op2);
|
||||||
Operand ne1 = context.VectorExtract(type, GetVec(op.Rn), 1);
|
});
|
||||||
|
|
||||||
Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), ne0, ne1);
|
|
||||||
|
|
||||||
context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -552,6 +546,24 @@ namespace ARMeilleure.Instructions
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FMAXNMP (scalar form): combines the two elements of Vn with a "maximum number"
// operation and writes the single result to Vd.
public static void Fmaxnmp_S(ArmEmitterContext context)
{
    bool useFastPath = Optimizations.FastFP && Optimizations.UseSse41;

    if (!useFastPath)
    {
        // Slow path: each pair is reduced through the soft-float FPMaxNum routine.
        EmitScalarPairwiseOpF(context, (op1, op2) =>
            EmitSoftFloatCall(context, nameof(SoftFloat32.FPMaxNum), op1, op2));

        return;
    }

    // Fast path: the pair is split with SSE shuffles and reduced by the SSE4.1 max/min-num helper.
    EmitSse2ScalarPairwiseOpF(context, (op1, op2) =>
        EmitSse41MaxMinNumOpF(context, isMaxNum: true, scalar: true, op1, op2));
}
|
||||||
|
|
||||||
public static void Fmaxnmp_V(ArmEmitterContext context)
|
public static void Fmaxnmp_V(ArmEmitterContext context)
|
||||||
{
|
{
|
||||||
if (Optimizations.FastFP && Optimizations.UseSse41)
|
if (Optimizations.FastFP && Optimizations.UseSse41)
|
||||||
|
@ -708,6 +720,24 @@ namespace ARMeilleure.Instructions
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FMINNMP (scalar form): combines the two elements of Vn with a "minimum number"
// operation and writes the single result to Vd.
public static void Fminnmp_S(ArmEmitterContext context)
{
    bool useFastPath = Optimizations.FastFP && Optimizations.UseSse41;

    if (!useFastPath)
    {
        // Slow path: each pair is reduced through the soft-float FPMinNum routine.
        EmitScalarPairwiseOpF(context, (op1, op2) =>
            EmitSoftFloatCall(context, nameof(SoftFloat32.FPMinNum), op1, op2));

        return;
    }

    // Fast path: the pair is split with SSE shuffles and reduced by the SSE4.1 max/min-num helper.
    EmitSse2ScalarPairwiseOpF(context, (op1, op2) =>
        EmitSse41MaxMinNumOpF(context, isMaxNum: false, scalar: true, op1, op2));
}
|
||||||
|
|
||||||
public static void Fminnmp_V(ArmEmitterContext context)
|
public static void Fminnmp_V(ArmEmitterContext context)
|
||||||
{
|
{
|
||||||
if (Optimizations.FastFP && Optimizations.UseSse41)
|
if (Optimizations.FastFP && Optimizations.UseSse41)
|
||||||
|
|
|
@ -1118,6 +1118,49 @@ namespace ARMeilleure.Instructions
|
||||||
context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
|
context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Scalar pairwise helper for the non-SSE path: extracts elements 0 and 1 of Rn, combines
// them through "emit", and stores the result in element 0 of an otherwise zeroed vector
// that is then copied to Rd.
public static void EmitScalarPairwiseOpF(ArmEmitterContext context, Func2I emit)
{
    OpCodeSimd op = (OpCodeSimd)context.CurrOp;

    // Bit 0 of Size selects the element type: clear => FP32, set => FP64.
    OperandType elemType;

    if ((op.Size & 1) == 0)
    {
        elemType = OperandType.FP32;
    }
    else
    {
        elemType = OperandType.FP64;
    }

    Operand first  = context.VectorExtract(elemType, GetVec(op.Rn), 0);
    Operand second = context.VectorExtract(elemType, GetVec(op.Rn), 1);

    // The zero vector is created before the combining operation is emitted.
    Operand zeroVec  = context.VectorZero();
    Operand combined = emit(first, second);

    Operand res = context.VectorInsert(zeroVec, combined, 0);

    context.Copy(GetVec(op.Rd), res);
}
|
||||||
|
|
||||||
|
// Scalar pairwise helper for the SSE path: builds two vectors, each holding one element of
// the Rn pair in its lowest lane, hands them to "emit", and copies the result to Rd.
public static void EmitSse2ScalarPairwiseOpF(ArmEmitterContext context, Func2I emit)
{
    OpCodeSimd op = (OpCodeSimd)context.CurrOp;

    Operand source = GetVec(op.Rn);

    Operand elem0, elem1;

    // Bit 0 of Size distinguishes double precision (set) from single precision (clear).
    bool isDouble = (op.Size & 1) != 0;

    if (isDouble)
    {
        Operand zero = context.VectorZero();

        // Presumably follows the x86 MOVLHPS/MOVHLPS operand order, giving
        // elem0 = { source.lo64, 0 } and elem1 = { source.hi64, 0 } - TODO confirm.
        elem0 = context.AddIntrinsic(Intrinsic.X86Movlhps, source, zero);
        elem1 = context.AddIntrinsic(Intrinsic.X86Movhlps, zero, source);
    }
    else
    {
        // PSHUFD selectors: lane 0 takes source element 0 (or 1); lanes 1-3 take element 2,
        // which is expected to be zero once the upper 64 bits have been cleared.
        const int selectElem0 = (2 << 6) | (2 << 4) | (2 << 2) | (0 << 0);
        const int selectElem1 = (2 << 6) | (2 << 4) | (2 << 2) | (1 << 0);

        Operand lowerHalf = context.VectorZeroUpper64(source);

        elem0 = context.AddIntrinsic(Intrinsic.X86Pshufd, lowerHalf, Const(selectElem0));
        elem1 = context.AddIntrinsic(Intrinsic.X86Pshufd, lowerHalf, Const(selectElem1));
    }

    // The destination operand is resolved before the combining operation is emitted.
    Operand dest = GetVec(op.Rd);
    Operand res  = emit(elem0, elem1);

    context.Copy(dest, res);
}
|
||||||
|
|
||||||
public static void EmitVectorPairwiseOpF(ArmEmitterContext context, Func2I emit)
|
public static void EmitVectorPairwiseOpF(ArmEmitterContext context, Func2I emit)
|
||||||
{
|
{
|
||||||
OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
|
OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
|
||||||
|
|
|
@ -212,6 +212,7 @@ namespace ARMeilleure.Instructions
|
||||||
Fmax_V,
|
Fmax_V,
|
||||||
Fmaxnm_S,
|
Fmaxnm_S,
|
||||||
Fmaxnm_V,
|
Fmaxnm_V,
|
||||||
|
Fmaxnmp_S,
|
||||||
Fmaxnmp_V,
|
Fmaxnmp_V,
|
||||||
Fmaxnmv_V,
|
Fmaxnmv_V,
|
||||||
Fmaxp_V,
|
Fmaxp_V,
|
||||||
|
@ -220,6 +221,7 @@ namespace ARMeilleure.Instructions
|
||||||
Fmin_V,
|
Fmin_V,
|
||||||
Fminnm_S,
|
Fminnm_S,
|
||||||
Fminnm_V,
|
Fminnm_V,
|
||||||
|
Fminnmp_S,
|
||||||
Fminnmp_V,
|
Fminnmp_V,
|
||||||
Fminnmv_V,
|
Fminnmv_V,
|
||||||
Fminp_V,
|
Fminp_V,
|
||||||
|
|
|
@ -108,6 +108,7 @@ namespace ARMeilleure.IntermediateRepresentation
|
||||||
X86Popcnt,
|
X86Popcnt,
|
||||||
X86Por,
|
X86Por,
|
||||||
X86Pshufb,
|
X86Pshufb,
|
||||||
|
X86Pshufd,
|
||||||
X86Pslld,
|
X86Pslld,
|
||||||
X86Pslldq,
|
X86Pslldq,
|
||||||
X86Psllq,
|
X86Psllq,
|
||||||
|
|
|
@ -715,19 +715,23 @@ namespace Ryujinx.Tests.Cpu
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
private static uint[] _F_Add_P_S_2SS_()
|
private static uint[] _F_Add_Max_Min_Nm_P_S_2SS_()
|
||||||
{
|
{
|
||||||
return new uint[]
|
return new uint[]
|
||||||
{
|
{
|
||||||
0x7E30D820u // FADDP S0, V1.2S
|
0x7E30D820u, // FADDP S0, V1.2S
|
||||||
|
0x7E30C820u, // FMAXNMP S0, V1.2S
|
||||||
|
0x7EB0C820u // FMINNMP S0, V1.2S
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
private static uint[] _F_Add_P_S_2DD_()
|
private static uint[] _F_Add_Max_Min_Nm_P_S_2DD_()
|
||||||
{
|
{
|
||||||
return new uint[]
|
return new uint[]
|
||||||
{
|
{
|
||||||
0x7E70D820u // FADDP D0, V1.2D
|
0x7E70D820u, // FADDP D0, V1.2D
|
||||||
|
0x7E70C820u, // FMAXNMP D0, V1.2D
|
||||||
|
0x7EF0C820u // FMINNMP D0, V1.2D
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1802,12 +1806,13 @@ namespace Ryujinx.Tests.Cpu
|
||||||
}
|
}
|
||||||
|
|
||||||
[Test, Pairwise] [Explicit]
|
[Test, Pairwise] [Explicit]
|
||||||
public void F_Add_P_S_2SS([ValueSource("_F_Add_P_S_2SS_")] uint opcodes,
|
public void F_Add_Max_Min_Nm_P_S_2SS([ValueSource("_F_Add_Max_Min_Nm_P_S_2SS_")] uint opcodes,
|
||||||
[ValueSource("_2S_F_")] ulong a)
|
[ValueSource("_2S_F_")] ulong a)
|
||||||
{
|
{
|
||||||
ulong z = TestContext.CurrentContext.Random.NextULong();
|
ulong z = TestContext.CurrentContext.Random.NextULong();
|
||||||
|
|
||||||
V128 v0 = MakeVectorE0E1(z, z);
|
V128 v0 = MakeVectorE0E1(z, z);
|
||||||
V128 v1 = MakeVectorE0(a);
|
V128 v1 = MakeVectorE0E1(a, z);
|
||||||
|
|
||||||
int rnd = (int)TestContext.CurrentContext.Random.NextUInt();
|
int rnd = (int)TestContext.CurrentContext.Random.NextUInt();
|
||||||
|
|
||||||
|
@ -1820,12 +1825,14 @@ namespace Ryujinx.Tests.Cpu
|
||||||
}
|
}
|
||||||
|
|
||||||
[Test, Pairwise] [Explicit]
|
[Test, Pairwise] [Explicit]
|
||||||
public void F_Add_P_S_2DD([ValueSource("_F_Add_P_S_2DD_")] uint opcodes,
|
public void F_Add_Max_Min_Nm_P_S_2DD([ValueSource("_F_Add_Max_Min_Nm_P_S_2DD_")] uint opcodes,
|
||||||
[ValueSource("_1D_F_")] ulong a)
|
[ValueSource("_1D_F_")] ulong a0,
|
||||||
|
[ValueSource("_1D_F_")] ulong a1)
|
||||||
{
|
{
|
||||||
ulong z = TestContext.CurrentContext.Random.NextULong();
|
ulong z = TestContext.CurrentContext.Random.NextULong();
|
||||||
V128 v0 = MakeVectorE1(z);
|
|
||||||
V128 v1 = MakeVectorE0E1(a, a);
|
V128 v0 = MakeVectorE0E1(z, z);
|
||||||
|
V128 v1 = MakeVectorE0E1(a0, a1);
|
||||||
|
|
||||||
int rnd = (int)TestContext.CurrentContext.Random.NextUInt();
|
int rnd = (int)TestContext.CurrentContext.Random.NextUInt();
|
||||||
|
|
||||||
|
|
Reference in a new issue