0
0
Fork 0
mirror of https://github.com/ryujinx-mirror/ryujinx.git synced 2024-12-23 12:45:50 +00:00
ryujinx-fork/ARMeilleure/Instructions/InstEmitSimdShift.cs
LDj3SNuD 5e724cf24e
Add Profiled Persistent Translation Cache. (#769)
* Delete DelegateTypes.cs

* Delete DelegateCache.cs

* Add files via upload

* Update Horizon.cs

* Update Program.cs

* Update MainWindow.cs

* Update Aot.cs

* Update RelocEntry.cs

* Update Translator.cs

* Update MemoryManager.cs

* Update InstEmitMemoryHelper.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nit.

* 10 fewer MSIL bytes for us

* Add comment. Nits.

* Update Translator.cs

* Update Aot.cs

* Nits.

* Opt..

* Opt..

* Opt..

* Opt..

* Allow to change compression level.

* Update MemoryManager.cs

* Update Translator.cs

* Manage corner cases during the save phase. Nits.

* Update Aot.cs

* Translator response tweak for Aot disabled. Nit.

* Nit.

* Nits.

* Create DelegateHelpers.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nits.

* Fix due to #784.

* Fixes due to #757 & #841.

* Fix due to #846.

* Fix due to #847.

* Use MethodInfo for managed method calls.

Use IR methods instead of managed methods about Max/Min (S/U).
Follow-ups & Nits.

* Add missing exception messages.

Reintroduce slow path for Fmov_Vi.
Implement slow path for Fmov_Si.

* Switch to the new folder structure.

Nits.

* Impl. index-based relocation information. Impl. cache file version field.

* Nit.

* Address gdkchan comments.

Mainly:
- fixed cache file corruption issue on exit; - exposed a way to disable AOT on the GUI.

* Address AcK77 comment.

* Address Thealexbarney, jduncanator & emmauss comments.

Header magic, CpuId (FI) & Aot -> Ptc.

* Adaptation to the new application reloading system.

Improvements to the call system of managed methods.
Follow-ups.
Nits.

* Get the same boot times as on master when PTC is disabled.

* Profiled Aot.

* A32 support (#897).

* #975 support (1 of 2).

* #975 support (2 of 2).

* Rebase fix & nits.

* Some fixes and nits (still one bug left).

* One fix & nits.

* Tests fix (by gdk) & nits.

* Support translations not only in high quality and rejit.

Nits.

* Added possibility to skip translations and continue execution, using `ESC` key.

* Update SettingsWindow.cs

* Update GLRenderer.cs

* Update Ptc.cs

* Disabled Profiled PTC by default as requested in the past by gdk.

* Fix rejit bug. Increased number of parallel translations. Add stack unwinding stuffs support (1 of 2).

Nits.

* Add stack unwinding stuffs support (2 of 2). Tuned number of parallel translations.

* Restored the ability to assemble jumps with 8-bit offset when Profiled PTC is disabled or during profiling.

Modifications due to rebase.
Nits.

* Limited profiling of the functions to be translated to the addresses belonging to the range of static objects only.

* Nits.

* Nits.

* Update Delegates.cs

* Nit.

* Update InstEmitSimdArithmetic.cs

* Address riperiperi comments.

* Fixed the issue of unjustifiably longer boot times at the second boot than at the first boot, measured at the same time or reference point and with the same number of translated functions.

* Implemented a simple redundant load/save mechanism.

Halved the value of Decoder.MaxInstsPerFunction more appropriate for the current performance of the Translator.
Replaced by Logger.PrintError to Logger.PrintDebug in TexturePool.cs about the supposed invalid texture format to avoid the spawn of the log.
Nits.

* Nit.

Improved Logger.PrintError in TexturePool.cs to avoid log spawn.
Added missing code for FZ handling (in output) for fp max/min instructions (slow paths).

* Add configuration migration for PTC

Co-authored-by: Thog <me@thog.eu>
2020-06-16 20:28:02 +02:00

1178 lines
38 KiB
C#

// https://github.com/intel/ARM_NEON_2_x86_SSE/blob/master/NEON_2_SSE.h
using ARMeilleure.Decoders;
using ARMeilleure.IntermediateRepresentation;
using ARMeilleure.Translation;
using System;
using System.Diagnostics;
using System.Reflection;
using static ARMeilleure.Instructions.InstEmitHelper;
using static ARMeilleure.Instructions.InstEmitSimdHelper;
using static ARMeilleure.IntermediateRepresentation.OperandHelper;
namespace ARMeilleure.Instructions
{
using Func2I = Func<Operand, Operand, Operand>;
static partial class InstEmit
{
#region "Masks"
private static readonly long[] _masks_SliSri = new long[] // Replication masks.
{
0x0101010101010101L, 0x0001000100010001L, 0x0000000100000001L, 0x0000000000000001L
};
#endregion
public static void Rshrn_V(ArmEmitterContext context)
{
if (Optimizations.UseSsse3)
{
OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
int shift = GetImmShr(op);
long roundConst = 1L << (shift - 1);
Operand d = GetVec(op.Rd);
Operand n = GetVec(op.Rn);
Operand dLow = context.VectorZeroUpper64(d);
Operand mask = null;
switch (op.Size + 1)
{
case 1: mask = X86GetAllElements(context, (int)roundConst * 0x00010001); break;
case 2: mask = X86GetAllElements(context, (int)roundConst); break;
case 3: mask = X86GetAllElements(context, roundConst); break;
}
Intrinsic addInst = X86PaddInstruction[op.Size + 1];
Operand res = context.AddIntrinsic(addInst, n, mask);
Intrinsic srlInst = X86PsrlInstruction[op.Size + 1];
res = context.AddIntrinsic(srlInst, res, Const(shift));
Operand mask2 = X86GetAllElements(context, EvenMasks[op.Size]);
res = context.AddIntrinsic(Intrinsic.X86Pshufb, res, mask2);
Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128
? Intrinsic.X86Movlhps
: Intrinsic.X86Movhlps;
res = context.AddIntrinsic(movInst, dLow, res);
context.Copy(d, res);
}
else
{
EmitVectorShrImmNarrowOpZx(context, round: true);
}
}
public static void Shl_S(ArmEmitterContext context)
{
OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
int shift = GetImmShl(op);
EmitScalarUnaryOpZx(context, (op1) => context.ShiftLeft(op1, Const(shift)));
}
public static void Shl_V(ArmEmitterContext context)
{
OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
int shift = GetImmShl(op);
if (Optimizations.UseSse2 && op.Size > 0)
{
Operand n = GetVec(op.Rn);
Intrinsic sllInst = X86PsllInstruction[op.Size];
Operand res = context.AddIntrinsic(sllInst, n, Const(shift));
if (op.RegisterSize == RegisterSize.Simd64)
{
res = context.VectorZeroUpper64(res);
}
context.Copy(GetVec(op.Rd), res);
}
else
{
EmitVectorUnaryOpZx(context, (op1) => context.ShiftLeft(op1, Const(shift)));
}
}
public static void Shll_V(ArmEmitterContext context)
{
OpCodeSimd op = (OpCodeSimd)context.CurrOp;
int shift = 8 << op.Size;
if (Optimizations.UseSse41)
{
Operand n = GetVec(op.Rn);
if (op.RegisterSize == RegisterSize.Simd128)
{
n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8));
}
Intrinsic movsxInst = X86PmovsxInstruction[op.Size];
Operand res = context.AddIntrinsic(movsxInst, n);
Intrinsic sllInst = X86PsllInstruction[op.Size + 1];
res = context.AddIntrinsic(sllInst, res, Const(shift));
context.Copy(GetVec(op.Rd), res);
}
else
{
EmitVectorShImmWidenBinaryZx(context, (op1, op2) => context.ShiftLeft(op1, op2), shift);
}
}
public static void Shrn_V(ArmEmitterContext context)
{
if (Optimizations.UseSsse3)
{
OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
int shift = GetImmShr(op);
Operand d = GetVec(op.Rd);
Operand n = GetVec(op.Rn);
Operand dLow = context.VectorZeroUpper64(d);
Intrinsic srlInst = X86PsrlInstruction[op.Size + 1];
Operand nShifted = context.AddIntrinsic(srlInst, n, Const(shift));
Operand mask = X86GetAllElements(context, EvenMasks[op.Size]);
Operand res = context.AddIntrinsic(Intrinsic.X86Pshufb, nShifted, mask);
Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128
? Intrinsic.X86Movlhps
: Intrinsic.X86Movhlps;
res = context.AddIntrinsic(movInst, dLow, res);
context.Copy(d, res);
}
else
{
EmitVectorShrImmNarrowOpZx(context, round: false);
}
}
public static void Sli_S(ArmEmitterContext context)
{
EmitSli(context, scalar: true);
}
public static void Sli_V(ArmEmitterContext context)
{
EmitSli(context, scalar: false);
}
public static void Sqrshl_V(ArmEmitterContext context)
{
OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
Operand res = context.VectorZero();
int elems = op.GetBytesCount() >> op.Size;
for (int index = 0; index < elems; index++)
{
Operand ne = EmitVectorExtractSx(context, op.Rn, index, op.Size);
Operand me = EmitVectorExtractSx(context, op.Rm, index, op.Size);
Operand e = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.SignedShlRegSatQ)), ne, me, Const(1), Const(op.Size));
res = EmitVectorInsert(context, res, e, index, op.Size);
}
context.Copy(GetVec(op.Rd), res);
}
public static void Sqrshrn_S(ArmEmitterContext context)
{
EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarSxSx);
}
public static void Sqrshrn_V(ArmEmitterContext context)
{
EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorSxSx);
}
public static void Sqrshrun_S(ArmEmitterContext context)
{
EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarSxZx);
}
public static void Sqrshrun_V(ArmEmitterContext context)
{
EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorSxZx);
}
public static void Sqshl_V(ArmEmitterContext context)
{
OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
Operand res = context.VectorZero();
int elems = op.GetBytesCount() >> op.Size;
for (int index = 0; index < elems; index++)
{
Operand ne = EmitVectorExtractSx(context, op.Rn, index, op.Size);
Operand me = EmitVectorExtractSx(context, op.Rm, index, op.Size);
Operand e = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.SignedShlRegSatQ)), ne, me, Const(0), Const(op.Size));
res = EmitVectorInsert(context, res, e, index, op.Size);
}
context.Copy(GetVec(op.Rd), res);
}
public static void Sqshrn_S(ArmEmitterContext context)
{
EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarSxSx);
}
public static void Sqshrn_V(ArmEmitterContext context)
{
EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorSxSx);
}
public static void Sqshrun_S(ArmEmitterContext context)
{
EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarSxZx);
}
public static void Sqshrun_V(ArmEmitterContext context)
{
EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorSxZx);
}
public static void Sri_S(ArmEmitterContext context)
{
EmitSri(context, scalar: true);
}
public static void Sri_V(ArmEmitterContext context)
{
EmitSri(context, scalar: false);
}
public static void Srshl_V(ArmEmitterContext context)
{
OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
Operand res = context.VectorZero();
int elems = op.GetBytesCount() >> op.Size;
for (int index = 0; index < elems; index++)
{
Operand ne = EmitVectorExtractSx(context, op.Rn, index, op.Size);
Operand me = EmitVectorExtractSx(context, op.Rm, index, op.Size);
Operand e = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.SignedShlReg)), ne, me, Const(1), Const(op.Size));
res = EmitVectorInsert(context, res, e, index, op.Size);
}
context.Copy(GetVec(op.Rd), res);
}
public static void Srshr_S(ArmEmitterContext context)
{
EmitScalarShrImmOpSx(context, ShrImmFlags.Round);
}
public static void Srshr_V(ArmEmitterContext context)
{
OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
if (Optimizations.UseSse2 && op.Size > 0 && op.Size < 3)
{
int shift = GetImmShr(op);
int eSize = 8 << op.Size;
Operand n = GetVec(op.Rn);
Intrinsic sllInst = X86PsllInstruction[op.Size];
Operand res = context.AddIntrinsic(sllInst, n, Const(eSize - shift));
Intrinsic srlInst = X86PsrlInstruction[op.Size];
res = context.AddIntrinsic(srlInst, res, Const(eSize - 1));
Intrinsic sraInst = X86PsraInstruction[op.Size];
Operand nSra = context.AddIntrinsic(sraInst, n, Const(shift));
Intrinsic addInst = X86PaddInstruction[op.Size];
res = context.AddIntrinsic(addInst, res, nSra);
if (op.RegisterSize == RegisterSize.Simd64)
{
res = context.VectorZeroUpper64(res);
}
context.Copy(GetVec(op.Rd), res);
}
else
{
EmitVectorShrImmOpSx(context, ShrImmFlags.Round);
}
}
public static void Srsra_S(ArmEmitterContext context)
{
EmitScalarShrImmOpSx(context, ShrImmFlags.Round | ShrImmFlags.Accumulate);
}
public static void Srsra_V(ArmEmitterContext context)
{
OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
if (Optimizations.UseSse2 && op.Size > 0 && op.Size < 3)
{
int shift = GetImmShr(op);
int eSize = 8 << op.Size;
Operand d = GetVec(op.Rd);
Operand n = GetVec(op.Rn);
Intrinsic sllInst = X86PsllInstruction[op.Size];
Operand res = context.AddIntrinsic(sllInst, n, Const(eSize - shift));
Intrinsic srlInst = X86PsrlInstruction[op.Size];
res = context.AddIntrinsic(srlInst, res, Const(eSize - 1));
Intrinsic sraInst = X86PsraInstruction[op.Size];
Operand nSra = context.AddIntrinsic(sraInst, n, Const(shift));
Intrinsic addInst = X86PaddInstruction[op.Size];
res = context.AddIntrinsic(addInst, res, nSra);
res = context.AddIntrinsic(addInst, res, d);
if (op.RegisterSize == RegisterSize.Simd64)
{
res = context.VectorZeroUpper64(res);
}
context.Copy(d, res);
}
else
{
EmitVectorShrImmOpSx(context, ShrImmFlags.Round | ShrImmFlags.Accumulate);
}
}
public static void Sshl_V(ArmEmitterContext context)
{
OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
Operand res = context.VectorZero();
int elems = op.GetBytesCount() >> op.Size;
for (int index = 0; index < elems; index++)
{
Operand ne = EmitVectorExtractSx(context, op.Rn, index, op.Size);
Operand me = EmitVectorExtractSx(context, op.Rm, index, op.Size);
Operand e = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.SignedShlReg)), ne, me, Const(0), Const(op.Size));
res = EmitVectorInsert(context, res, e, index, op.Size);
}
context.Copy(GetVec(op.Rd), res);
}
public static void Sshll_V(ArmEmitterContext context)
{
OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
int shift = GetImmShl(op);
if (Optimizations.UseSse41)
{
Operand n = GetVec(op.Rn);
if (op.RegisterSize == RegisterSize.Simd128)
{
n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8));
}
Intrinsic movsxInst = X86PmovsxInstruction[op.Size];
Operand res = context.AddIntrinsic(movsxInst, n);
if (shift != 0)
{
Intrinsic sllInst = X86PsllInstruction[op.Size + 1];
res = context.AddIntrinsic(sllInst, res, Const(shift));
}
context.Copy(GetVec(op.Rd), res);
}
else
{
EmitVectorShImmWidenBinarySx(context, (op1, op2) => context.ShiftLeft(op1, op2), shift);
}
}
public static void Sshr_S(ArmEmitterContext context)
{
EmitShrImmOp(context, ShrImmFlags.ScalarSx);
}
public static void Sshr_V(ArmEmitterContext context)
{
OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
if (Optimizations.UseSse2 && op.Size > 0 && op.Size < 3)
{
int shift = GetImmShr(op);
Operand n = GetVec(op.Rn);
Intrinsic sraInst = X86PsraInstruction[op.Size];
Operand res = context.AddIntrinsic(sraInst, n, Const(shift));
if (op.RegisterSize == RegisterSize.Simd64)
{
res = context.VectorZeroUpper64(res);
}
context.Copy(GetVec(op.Rd), res);
}
else
{
EmitShrImmOp(context, ShrImmFlags.VectorSx);
}
}
public static void Ssra_S(ArmEmitterContext context)
{
EmitScalarShrImmOpSx(context, ShrImmFlags.Accumulate);
}
public static void Ssra_V(ArmEmitterContext context)
{
OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
if (Optimizations.UseSse2 && op.Size > 0 && op.Size < 3)
{
int shift = GetImmShr(op);
Operand d = GetVec(op.Rd);
Operand n = GetVec(op.Rn);
Intrinsic sraInst = X86PsraInstruction[op.Size];
Operand res = context.AddIntrinsic(sraInst, n, Const(shift));
Intrinsic addInst = X86PaddInstruction[op.Size];
res = context.AddIntrinsic(addInst, res, d);
if (op.RegisterSize == RegisterSize.Simd64)
{
res = context.VectorZeroUpper64(res);
}
context.Copy(d, res);
}
else
{
EmitVectorShrImmOpSx(context, ShrImmFlags.Accumulate);
}
}
public static void Uqrshl_V(ArmEmitterContext context)
{
OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
Operand res = context.VectorZero();
int elems = op.GetBytesCount() >> op.Size;
for (int index = 0; index < elems; index++)
{
Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
Operand me = EmitVectorExtractZx(context, op.Rm, index, op.Size);
Operand e = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.UnsignedShlRegSatQ)), ne, me, Const(1), Const(op.Size));
res = EmitVectorInsert(context, res, e, index, op.Size);
}
context.Copy(GetVec(op.Rd), res);
}
public static void Uqrshrn_S(ArmEmitterContext context)
{
EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarZxZx);
}
public static void Uqrshrn_V(ArmEmitterContext context)
{
EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorZxZx);
}
public static void Uqshl_V(ArmEmitterContext context)
{
OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
Operand res = context.VectorZero();
int elems = op.GetBytesCount() >> op.Size;
for (int index = 0; index < elems; index++)
{
Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
Operand me = EmitVectorExtractZx(context, op.Rm, index, op.Size);
Operand e = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.UnsignedShlRegSatQ)), ne, me, Const(0), Const(op.Size));
res = EmitVectorInsert(context, res, e, index, op.Size);
}
context.Copy(GetVec(op.Rd), res);
}
public static void Uqshrn_S(ArmEmitterContext context)
{
EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarZxZx);
}
public static void Uqshrn_V(ArmEmitterContext context)
{
EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorZxZx);
}
public static void Urshl_V(ArmEmitterContext context)
{
OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
Operand res = context.VectorZero();
int elems = op.GetBytesCount() >> op.Size;
for (int index = 0; index < elems; index++)
{
Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
Operand me = EmitVectorExtractZx(context, op.Rm, index, op.Size);
Operand e = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.UnsignedShlReg)), ne, me, Const(1), Const(op.Size));
res = EmitVectorInsert(context, res, e, index, op.Size);
}
context.Copy(GetVec(op.Rd), res);
}
public static void Urshr_S(ArmEmitterContext context)
{
EmitScalarShrImmOpZx(context, ShrImmFlags.Round);
}
public static void Urshr_V(ArmEmitterContext context)
{
OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
if (Optimizations.UseSse2 && op.Size > 0)
{
int shift = GetImmShr(op);
int eSize = 8 << op.Size;
Operand n = GetVec(op.Rn);
Intrinsic sllInst = X86PsllInstruction[op.Size];
Operand res = context.AddIntrinsic(sllInst, n, Const(eSize - shift));
Intrinsic srlInst = X86PsrlInstruction[op.Size];
res = context.AddIntrinsic(srlInst, res, Const(eSize - 1));
Operand nSrl = context.AddIntrinsic(srlInst, n, Const(shift));
Intrinsic addInst = X86PaddInstruction[op.Size];
res = context.AddIntrinsic(addInst, res, nSrl);
if (op.RegisterSize == RegisterSize.Simd64)
{
res = context.VectorZeroUpper64(res);
}
context.Copy(GetVec(op.Rd), res);
}
else
{
EmitVectorShrImmOpZx(context, ShrImmFlags.Round);
}
}
public static void Ursra_S(ArmEmitterContext context)
{
EmitScalarShrImmOpZx(context, ShrImmFlags.Round | ShrImmFlags.Accumulate);
}
public static void Ursra_V(ArmEmitterContext context)
{
OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
if (Optimizations.UseSse2 && op.Size > 0)
{
int shift = GetImmShr(op);
int eSize = 8 << op.Size;
Operand d = GetVec(op.Rd);
Operand n = GetVec(op.Rn);
Intrinsic sllInst = X86PsllInstruction[op.Size];
Operand res = context.AddIntrinsic(sllInst, n, Const(eSize - shift));
Intrinsic srlInst = X86PsrlInstruction[op.Size];
res = context.AddIntrinsic(srlInst, res, Const(eSize - 1));
Operand nSrl = context.AddIntrinsic(srlInst, n, Const(shift));
Intrinsic addInst = X86PaddInstruction[op.Size];
res = context.AddIntrinsic(addInst, res, nSrl);
res = context.AddIntrinsic(addInst, res, d);
if (op.RegisterSize == RegisterSize.Simd64)
{
res = context.VectorZeroUpper64(res);
}
context.Copy(d, res);
}
else
{
EmitVectorShrImmOpZx(context, ShrImmFlags.Round | ShrImmFlags.Accumulate);
}
}
public static void Ushl_V(ArmEmitterContext context)
{
OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
Operand res = context.VectorZero();
int elems = op.GetBytesCount() >> op.Size;
for (int index = 0; index < elems; index++)
{
Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
Operand me = EmitVectorExtractSx(context, op.Rm, index << op.Size, 0);
Operand e = EmitUnsignedShlRegOp(context, ne, context.ConvertI64ToI32(me), op.Size);
res = EmitVectorInsert(context, res, e, index, op.Size);
}
context.Copy(GetVec(op.Rd), res);
}
public static void Ushll_V(ArmEmitterContext context)
{
OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
int shift = GetImmShl(op);
if (Optimizations.UseSse41)
{
Operand n = GetVec(op.Rn);
if (op.RegisterSize == RegisterSize.Simd128)
{
n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8));
}
Intrinsic movzxInst = X86PmovzxInstruction[op.Size];
Operand res = context.AddIntrinsic(movzxInst, n);
if (shift != 0)
{
Intrinsic sllInst = X86PsllInstruction[op.Size + 1];
res = context.AddIntrinsic(sllInst, res, Const(shift));
}
context.Copy(GetVec(op.Rd), res);
}
else
{
EmitVectorShImmWidenBinaryZx(context, (op1, op2) => context.ShiftLeft(op1, op2), shift);
}
}
public static void Ushr_S(ArmEmitterContext context)
{
EmitShrImmOp(context, ShrImmFlags.ScalarZx);
}
public static void Ushr_V(ArmEmitterContext context)
{
OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
if (Optimizations.UseSse2 && op.Size > 0)
{
int shift = GetImmShr(op);
Operand n = GetVec(op.Rn);
Intrinsic srlInst = X86PsrlInstruction[op.Size];
Operand res = context.AddIntrinsic(srlInst, n, Const(shift));
if (op.RegisterSize == RegisterSize.Simd64)
{
res = context.VectorZeroUpper64(res);
}
context.Copy(GetVec(op.Rd), res);
}
else
{
EmitShrImmOp(context, ShrImmFlags.VectorZx);
}
}
public static void Usra_S(ArmEmitterContext context)
{
EmitScalarShrImmOpZx(context, ShrImmFlags.Accumulate);
}
public static void Usra_V(ArmEmitterContext context)
{
OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
if (Optimizations.UseSse2 && op.Size > 0)
{
int shift = GetImmShr(op);
Operand d = GetVec(op.Rd);
Operand n = GetVec(op.Rn);
Intrinsic srlInst = X86PsrlInstruction[op.Size];
Operand res = context.AddIntrinsic(srlInst, n, Const(shift));
Intrinsic addInst = X86PaddInstruction[op.Size];
res = context.AddIntrinsic(addInst, res, d);
if (op.RegisterSize == RegisterSize.Simd64)
{
res = context.VectorZeroUpper64(res);
}
context.Copy(d, res);
}
else
{
EmitVectorShrImmOpZx(context, ShrImmFlags.Accumulate);
}
}
[Flags]
private enum ShrImmFlags
{
Scalar = 1 << 0,
Signed = 1 << 1,
Round = 1 << 2,
Accumulate = 1 << 3,
ScalarSx = Scalar | Signed,
ScalarZx = Scalar,
VectorSx = Signed,
VectorZx = 0
}
private static void EmitScalarShrImmOpSx(ArmEmitterContext context, ShrImmFlags flags)
{
EmitShrImmOp(context, ShrImmFlags.ScalarSx | flags);
}
private static void EmitScalarShrImmOpZx(ArmEmitterContext context, ShrImmFlags flags)
{
EmitShrImmOp(context, ShrImmFlags.ScalarZx | flags);
}
private static void EmitVectorShrImmOpSx(ArmEmitterContext context, ShrImmFlags flags)
{
EmitShrImmOp(context, ShrImmFlags.VectorSx | flags);
}
private static void EmitVectorShrImmOpZx(ArmEmitterContext context, ShrImmFlags flags)
{
EmitShrImmOp(context, ShrImmFlags.VectorZx | flags);
}
private static void EmitShrImmOp(ArmEmitterContext context, ShrImmFlags flags)
{
OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
Operand res = context.VectorZero();
bool scalar = (flags & ShrImmFlags.Scalar) != 0;
bool signed = (flags & ShrImmFlags.Signed) != 0;
bool round = (flags & ShrImmFlags.Round) != 0;
bool accumulate = (flags & ShrImmFlags.Accumulate) != 0;
int shift = GetImmShr(op);
long roundConst = 1L << (shift - 1);
int elems = !scalar ? op.GetBytesCount() >> op.Size : 1;
for (int index = 0; index < elems; index++)
{
Operand e = EmitVectorExtract(context, op.Rn, index, op.Size, signed);
if (op.Size <= 2)
{
if (round)
{
e = context.Add(e, Const(roundConst));
}
e = signed ? context.ShiftRightSI(e, Const(shift)) : context.ShiftRightUI(e, Const(shift));
}
else /* if (op.Size == 3) */
{
e = EmitShrImm64(context, e, signed, round ? roundConst : 0L, shift);
}
if (accumulate)
{
Operand de = EmitVectorExtract(context, op.Rd, index, op.Size, signed);
e = context.Add(e, de);
}
res = EmitVectorInsert(context, res, e, index, op.Size);
}
context.Copy(GetVec(op.Rd), res);
}
private static Operand EmitUnsignedShlRegOp(ArmEmitterContext context, Operand op, Operand shiftLsB, int size)
{
Debug.Assert(op.Type == OperandType.I64);
Debug.Assert(shiftLsB.Type == OperandType.I32);
Debug.Assert((uint)size < 4u);
Operand negShiftLsB = context.Negate(shiftLsB);
Operand isPositive = context.ICompareGreaterOrEqual(shiftLsB, Const(0));
Operand shl = context.ShiftLeft (op, shiftLsB);
Operand shr = context.ShiftRightUI(op, negShiftLsB);
Operand res = context.ConditionalSelect(isPositive, shl, shr);
Operand isOutOfRange = context.BitwiseOr(
context.ICompareGreaterOrEqual(shiftLsB, Const(8 << size)),
context.ICompareGreaterOrEqual(negShiftLsB, Const(8 << size)));
return context.ConditionalSelect(isOutOfRange, Const(0UL), res);
}
private static void EmitVectorShrImmNarrowOpZx(ArmEmitterContext context, bool round)
{
OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
int shift = GetImmShr(op);
long roundConst = 1L << (shift - 1);
int elems = 8 >> op.Size;
int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
Operand d = GetVec(op.Rd);
Operand res = part == 0 ? context.VectorZero() : context.Copy(d);
for (int index = 0; index < elems; index++)
{
Operand e = EmitVectorExtractZx(context, op.Rn, index, op.Size + 1);
if (round)
{
e = context.Add(e, Const(roundConst));
}
e = context.ShiftRightUI(e, Const(shift));
res = EmitVectorInsert(context, res, e, part + index, op.Size);
}
context.Copy(d, res);
}
[Flags]
private enum ShrImmSaturatingNarrowFlags
{
Scalar = 1 << 0,
SignedSrc = 1 << 1,
SignedDst = 1 << 2,
Round = 1 << 3,
ScalarSxSx = Scalar | SignedSrc | SignedDst,
ScalarSxZx = Scalar | SignedSrc,
ScalarZxZx = Scalar,
VectorSxSx = SignedSrc | SignedDst,
VectorSxZx = SignedSrc,
VectorZxZx = 0
}
private static void EmitRoundShrImmSaturatingNarrowOp(ArmEmitterContext context, ShrImmSaturatingNarrowFlags flags)
{
EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.Round | flags);
}
private static void EmitShrImmSaturatingNarrowOp(ArmEmitterContext context, ShrImmSaturatingNarrowFlags flags)
{
OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
bool scalar = (flags & ShrImmSaturatingNarrowFlags.Scalar) != 0;
bool signedSrc = (flags & ShrImmSaturatingNarrowFlags.SignedSrc) != 0;
bool signedDst = (flags & ShrImmSaturatingNarrowFlags.SignedDst) != 0;
bool round = (flags & ShrImmSaturatingNarrowFlags.Round) != 0;
int shift = GetImmShr(op);
long roundConst = 1L << (shift - 1);
int elems = !scalar ? 8 >> op.Size : 1;
int part = !scalar && (op.RegisterSize == RegisterSize.Simd128) ? elems : 0;
Operand d = GetVec(op.Rd);
Operand res = part == 0 ? context.VectorZero() : context.Copy(d);
for (int index = 0; index < elems; index++)
{
Operand e = EmitVectorExtract(context, op.Rn, index, op.Size + 1, signedSrc);
if (op.Size <= 1 || !round)
{
if (round)
{
e = context.Add(e, Const(roundConst));
}
e = signedSrc ? context.ShiftRightSI(e, Const(shift)) : context.ShiftRightUI(e, Const(shift));
}
else /* if (op.Size == 2 && round) */
{
e = EmitShrImm64(context, e, signedSrc, roundConst, shift); // shift <= 32
}
e = EmitSatQ(context, e, op.Size, signedSrc, signedDst);
res = EmitVectorInsert(context, res, e, part + index, op.Size);
}
context.Copy(d, res);
}
// dst64 = (Int(src64, signed) + roundConst) >> shift;
private static Operand EmitShrImm64(
ArmEmitterContext context,
Operand value,
bool signed,
long roundConst,
int shift)
{
MethodInfo info = signed
? typeof(SoftFallback).GetMethod(nameof(SoftFallback.SignedShrImm64))
: typeof(SoftFallback).GetMethod(nameof(SoftFallback.UnsignedShrImm64));
return context.Call(info, value, Const(roundConst), Const(shift));
}
private static void EmitVectorShImmWidenBinarySx(ArmEmitterContext context, Func2I emit, int imm)
{
EmitVectorShImmWidenBinaryOp(context, emit, imm, signed: true);
}
private static void EmitVectorShImmWidenBinaryZx(ArmEmitterContext context, Func2I emit, int imm)
{
EmitVectorShImmWidenBinaryOp(context, emit, imm, signed: false);
}
private static void EmitVectorShImmWidenBinaryOp(ArmEmitterContext context, Func2I emit, int imm, bool signed)
{
OpCodeSimd op = (OpCodeSimd)context.CurrOp;
Operand res = context.VectorZero();
int elems = 8 >> op.Size;
int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
for (int index = 0; index < elems; index++)
{
Operand ne = EmitVectorExtract(context, op.Rn, part + index, op.Size, signed);
res = EmitVectorInsert(context, res, emit(ne, Const(imm)), index, op.Size + 1);
}
context.Copy(GetVec(op.Rd), res);
}
private static void EmitSli(ArmEmitterContext context, bool scalar)
{
OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
int shift = GetImmShl(op);
ulong mask = shift != 0 ? ulong.MaxValue >> (64 - shift) : 0UL;
if (Optimizations.UseSse2 && op.Size > 0)
{
Operand d = GetVec(op.Rd);
Operand n = GetVec(op.Rn);
Intrinsic sllInst = X86PsllInstruction[op.Size];
Operand nShifted = context.AddIntrinsic(sllInst, n, Const(shift));
Operand dMask = X86GetAllElements(context, (long)mask * _masks_SliSri[op.Size]);
Operand dMasked = context.AddIntrinsic(Intrinsic.X86Pand, d, dMask);
Operand res = context.AddIntrinsic(Intrinsic.X86Por, nShifted, dMasked);
if ((op.RegisterSize == RegisterSize.Simd64) || scalar)
{
res = context.VectorZeroUpper64(res);
}
context.Copy(d, res);
}
else
{
Operand res = context.VectorZero();
int elems = !scalar ? op.GetBytesCount() >> op.Size : 1;
for (int index = 0; index < elems; index++)
{
Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
Operand neShifted = context.ShiftLeft(ne, Const(shift));
Operand de = EmitVectorExtractZx(context, op.Rd, index, op.Size);
Operand deMasked = context.BitwiseAnd(de, Const(mask));
Operand e = context.BitwiseOr(neShifted, deMasked);
res = EmitVectorInsert(context, res, e, index, op.Size);
}
context.Copy(GetVec(op.Rd), res);
}
}
private static void EmitSri(ArmEmitterContext context, bool scalar)
{
OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
int shift = GetImmShr(op);
int eSize = 8 << op.Size;
ulong mask = (ulong.MaxValue << (eSize - shift)) & (ulong.MaxValue >> (64 - eSize));
if (Optimizations.UseSse2 && op.Size > 0)
{
Operand d = GetVec(op.Rd);
Operand n = GetVec(op.Rn);
Intrinsic srlInst = X86PsrlInstruction[op.Size];
Operand nShifted = context.AddIntrinsic(srlInst, n, Const(shift));
Operand dMask = X86GetAllElements(context, (long)mask * _masks_SliSri[op.Size]);
Operand dMasked = context.AddIntrinsic(Intrinsic.X86Pand, d, dMask);
Operand res = context.AddIntrinsic(Intrinsic.X86Por, nShifted, dMasked);
if ((op.RegisterSize == RegisterSize.Simd64) || scalar)
{
res = context.VectorZeroUpper64(res);
}
context.Copy(d, res);
}
else
{
Operand res = context.VectorZero();
int elems = !scalar ? op.GetBytesCount() >> op.Size : 1;
for (int index = 0; index < elems; index++)
{
Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
Operand neShifted = shift != 64 ? context.ShiftRightUI(ne, Const(shift)) : Const(0UL);
Operand de = EmitVectorExtractZx(context, op.Rd, index, op.Size);
Operand deMasked = context.BitwiseAnd(de, Const(mask));
Operand e = context.BitwiseOr(neShifted, deMasked);
res = EmitVectorInsert(context, res, e, index, op.Size);
}
context.Copy(GetVec(op.Rd), res);
}
}
}
}