Add VCLZ.* fast path (#1917)
* Add VCLZ fast path * Add VCLZ.8B/16B SSSE3 fast path * Add VCLZ.4H/8H SSSE3 fast path * Add VCLZ.2S/4S SSE2 fast path * Improve CLZ.4H/8H fast path * Improve CLZ.2S/4S fast path * Set PPTC version
This commit is contained in:
parent
f94acdb4ef
commit
ddf1105bcb
3 changed files with 145 additions and 9 deletions
|
@ -120,24 +120,155 @@ namespace ARMeilleure.Instructions
|
|||
{
|
||||
OpCodeSimd op = (OpCodeSimd)context.CurrOp;
|
||||
|
||||
Operand res = context.VectorZero();
|
||||
|
||||
int elems = op.GetBytesCount() >> op.Size;
|
||||
|
||||
int eSize = 8 << op.Size;
|
||||
|
||||
for (int index = 0; index < elems; index++)
|
||||
Operand res = eSize switch {
|
||||
8 => Clz_V_I8 (context, GetVec(op.Rn)),
|
||||
16 => Clz_V_I16(context, GetVec(op.Rn)),
|
||||
32 => Clz_V_I32(context, GetVec(op.Rn)),
|
||||
_ => null
|
||||
};
|
||||
|
||||
if (res != null)
|
||||
{
|
||||
Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
|
||||
if (op.RegisterSize == RegisterSize.Simd64)
|
||||
{
|
||||
res = context.VectorZeroUpper64(res);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
int elems = op.GetBytesCount() >> op.Size;
|
||||
|
||||
Operand de = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingZeros)), ne, Const(eSize));
|
||||
res = context.VectorZero();
|
||||
|
||||
res = EmitVectorInsert(context, res, de, index, op.Size);
|
||||
for (int index = 0; index < elems; index++)
|
||||
{
|
||||
Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
|
||||
|
||||
Operand de = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingZeros)), ne, Const(eSize));
|
||||
|
||||
res = EmitVectorInsert(context, res, de, index, op.Size);
|
||||
}
|
||||
}
|
||||
|
||||
context.Copy(GetVec(op.Rd), res);
|
||||
}
|
||||
|
||||
/// <summary>
/// Emits the SSSE3 fast path that counts leading zeros in each 8-bit element of <paramref name="arg"/>.
/// </summary>
/// <param name="context">Emitter context the intrinsics are appended to.</param>
/// <param name="arg">Source vector.</param>
/// <returns>The operand holding the per-byte counts, or null when <c>Optimizations.UseSsse3</c> is false.</returns>
private static Operand Clz_V_I8(ArmEmitterContext context, Operand arg)
{
    if (Optimizations.UseSsse3)
    {
        // Per-nibble CLZ lookup table; reading the literal from its least significant byte
        // gives the counts for nibble values 0..7: 4, 3, 2, 2, 1, 1, 1, 1.
        Operand nibbleClzTable = X86GetScalar(context, 0x01_01_01_01_02_02_03_04);

        Operand lowNibbleMask = X86GetAllElements(context, 0x0f_0f_0f_0f);
        Operand allFours = X86GetAllElements(context, 0x04_04_04_04);

        // Table lookup indexed by the elements of arg: CLZ of the low 4 bits.
        Operand lowCount = context.AddIntrinsic(Intrinsic.X86Pshufb, nibbleClzTable, arg);

        // Isolate the high 4 bits of every element (shift down, then mask).
        Operand highNibbles = context.AddIntrinsic(
            Intrinsic.X86Pand,
            context.AddIntrinsic(Intrinsic.X86Psrlw, arg, Const(4)),
            lowNibbleMask);

        // Table lookup indexed by the high nibbles: CLZ of the high 4 bits.
        Operand highCount = context.AddIntrinsic(Intrinsic.X86Pshufb, nibbleClzTable, highNibbles);

        // The low-nibble count only contributes when the high nibble is entirely zero,
        // i.e. when its count equals 4; otherwise it is cleared.
        Operand highIsZero = context.AddIntrinsic(Intrinsic.X86Pcmpeqb, highCount, allFours);
        Operand lowContribution = context.AddIntrinsic(Intrinsic.X86Pand, lowCount, highIsZero);

        return context.AddIntrinsic(Intrinsic.X86Paddb, lowContribution, highCount);
    }

    // No SSSE3: the caller is expected to handle the null result.
    return null;
}
|
||||
|
||||
/// <summary>
/// Emits the SSSE3 fast path that counts leading zeros in each 16-bit element of <paramref name="arg"/>,
/// built on top of the per-byte counts produced by <c>Clz_V_I8</c>.
/// </summary>
/// <param name="context">Emitter context the intrinsics are appended to.</param>
/// <param name="arg">Source vector.</param>
/// <returns>The operand holding the per-halfword counts, or null when <c>Optimizations.UseSsse3</c> is false.</returns>
private static Operand Clz_V_I16(ArmEmitterContext context, Operand arg)
{
    if (Optimizations.UseSsse3)
    {
        // Shuffle control that selects the odd-indexed (high) byte of each 16-bit element
        // for the low byte position; the 0x80 control bytes fill the other position.
        Operand highByteSelector = X86GetElements(context, 0x80_0f_80_0d_80_0b_80_09, 0x80_07_80_05_80_03_80_01);
        Operand lowByteMask = X86GetAllElements(context, 0x00ff_00ff);
        Operand allEights = X86GetAllElements(context, 0x0008_0008);

        // Each 16-bit element now holds two counts: one for its high byte, one for its low byte.
        Operand byteCounts = Clz_V_I8(context, arg);

        // Count belonging to the low 8 bits of each element.
        Operand lowCount = context.AddIntrinsic(Intrinsic.X86Pand, byteCounts, lowByteMask);

        // Count belonging to the high 8 bits of each element.
        Operand highCount = context.AddIntrinsic(Intrinsic.X86Pshufb, byteCounts, highByteSelector);

        // The low-byte count only contributes when the high byte is entirely zero,
        // i.e. when its count equals 8; otherwise it is cleared.
        Operand highIsZero = context.AddIntrinsic(Intrinsic.X86Pcmpeqw, highCount, allEights);
        Operand lowContribution = context.AddIntrinsic(Intrinsic.X86Pand, lowCount, highIsZero);

        return context.AddIntrinsic(Intrinsic.X86Paddw, lowContribution, highCount);
    }

    // No SSSE3: the caller is expected to handle the null result.
    return null;
}
|
||||
|
||||
/// <summary>
/// Emits the SSE2 fast path that counts leading zeros in each 32-bit element of <paramref name="arg"/>.
/// The leading zeros are turned into the only set bits of the element and then counted with a
/// shift/mask/add population count.
/// </summary>
/// <param name="context">Emitter context the intrinsics are appended to.</param>
/// <param name="arg">Source vector.</param>
/// <returns>The operand holding the per-word counts, or null when <c>Optimizations.UseSse2</c> is false.</returns>
private static Operand Clz_V_I32(ArmEmitterContext context, Operand arg)
{
    // TODO: Use vplzcntd when AVX-512 is supported.
    if (!Optimizations.UseSse2)
    {
        return null;
    }

    // Thin wrappers around the intrinsics used below.
    Operand Add(Operand lhs, Operand rhs) => context.AddIntrinsic(Intrinsic.X86Paddd, lhs, rhs);
    Operand Sub(Operand lhs, Operand rhs) => context.AddIntrinsic(Intrinsic.X86Psubd, lhs, rhs);
    Operand Shr(Operand value, int count) => context.AddIntrinsic(Intrinsic.X86Psrld, value, Const(count));
    Operand Or(Operand lhs, Operand rhs) => context.AddIntrinsic(Intrinsic.X86Por, lhs, rhs);
    Operand And(Operand lhs, Operand rhs) => context.AddIntrinsic(Intrinsic.X86Pand, lhs, rhs);
    Operand Not(Operand value) => context.AddIntrinsic(Intrinsic.X86Pandn, value, context.VectorOne());

    Operand mask55 = X86GetAllElements(context, 0x55555555);
    Operand mask33 = X86GetAllElements(context, 0x33333333);
    Operand mask0f = X86GetAllElements(context, 0x0f0f0f0f);
    Operand mask3f = X86GetAllElements(context, 0x0000003f);

    // Propagate the highest set bit into every lower position (shifts of 1, 2, 4, 8, 16).
    Operand smeared = arg;

    for (int shift = 1; shift <= 16; shift <<= 1)
    {
        smeared = Or(Shr(smeared, shift), smeared);
    }

    // After inversion only the former leading zeros remain set.
    Operand leadingOnes = Not(smeared);

    // Population count of the leading ones, widening the partial sums step by step.
    Operand pairSums = Sub(leadingOnes, And(Shr(leadingOnes, 1), mask55));

    Operand nibbleSums = Add(And(Shr(pairSums, 2), mask33), And(pairSums, mask33));

    Operand byteSums = And(Add(Shr(nibbleSums, 4), nibbleSums), mask0f);

    Operand halfSums = Add(Shr(byteSums, 8), byteSums);

    Operand wordSums = Add(Shr(halfSums, 16), halfSums);

    // Keep only the low 6 bits of the accumulated sum.
    return And(wordSums, mask3f);
}
|
||||
|
||||
public static void Cnt_V(ArmEmitterContext context)
|
||||
{
|
||||
OpCodeSimd op = (OpCodeSimd)context.CurrOp;
|
||||
|
|
|
@ -209,6 +209,11 @@ namespace ARMeilleure.Instructions
|
|||
}
|
||||
|
||||
/// <summary>
/// Signed overload: casts both 64-bit values to <see cref="ulong"/> and forwards them to the
/// unsigned <c>X86GetElements</c> overload.
/// </summary>
/// <param name="context">Emitter context passed through to the unsigned overload.</param>
/// <param name="e1">First 64-bit element value, forwarded as <c>e1</c>.</param>
/// <param name="e0">Second 64-bit element value, forwarded as <c>e0</c>.</param>
/// <returns>The operand produced by the unsigned overload.</returns>
public static Operand X86GetElements(ArmEmitterContext context, long e1, long e0)
    => X86GetElements(context, (ulong)e1, (ulong)e0);
|
||||
|
||||
public static Operand X86GetElements(ArmEmitterContext context, ulong e1, ulong e0)
|
||||
{
|
||||
Operand vector0 = context.VectorCreateScalar(Const(e0));
|
||||
Operand vector1 = context.VectorCreateScalar(Const(e1));
|
||||
|
|
|
@ -22,7 +22,7 @@ namespace ARMeilleure.Translation.PTC
|
|||
{
|
||||
private const string HeaderMagic = "PTChd";
|
||||
|
||||
private const int InternalVersion = 1817; //! To be incremented manually for each change to the ARMeilleure project.
|
||||
private const int InternalVersion = 1917; //! To be incremented manually for each change to the ARMeilleure project.
|
||||
|
||||
private const string ActualDir = "0";
|
||||
private const string BackupDir = "1";
|
||||
|
|
Reference in a new issue