0
0
Fork 0
mirror of https://github.com/GreemDev/Ryujinx.git synced 2025-01-03 19:52:00 +00:00

Add Smlal_Ve, Smlsl_Ve, Smull_Ve, Umlal_Ve, Umlsl_Ve, Umull_Ve Inst.; add Tests. Add Sse Opt. for Trn1/2_V and Uzp1/2_V Inst. Nits. (#566)

* Update OpCodeTable.cs

* Update InstEmitSimdArithmetic.cs

* Update InstEmitSimdHelper.cs

* Update CpuTestSimdRegElem.cs

* Update InstEmitSimdMove.cs

* Update InstEmitSimdCvt.cs

* Update SoftFallback.cs

* Update InstEmitSimdHelper.cs

* Update SoftFloat.cs

* Update CryptoHelper.cs

* Update InstEmitSimdArithmetic.cs

* Update InstEmitSimdCmp.cs

* Address PR feedback.

* Address PR feedback.
This commit is contained in:
LDj3SNuD 2019-01-29 14:54:39 +01:00 committed by gdkchan
parent 36b9ab0e48
commit 8f7fcede7f
10 changed files with 453 additions and 175 deletions

View file

@ -9,7 +9,7 @@ namespace ChocolArm64.Instructions
static class CryptoHelper static class CryptoHelper
{ {
#region "LookUp Tables" #region "LookUp Tables"
private static byte[] _sBox = private static readonly byte[] _sBox = new byte[]
{ {
0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
@ -29,7 +29,7 @@ namespace ChocolArm64.Instructions
0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
}; };
private static byte[] _invSBox = private static readonly byte[] _invSBox = new byte[]
{ {
0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
@ -49,7 +49,7 @@ namespace ChocolArm64.Instructions
0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
}; };
private static byte[] _gfMul02 = private static readonly byte[] _gfMul02 = new byte[]
{ {
0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e, 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e,
0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, 0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
@ -69,7 +69,7 @@ namespace ChocolArm64.Instructions
0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5 0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5
}; };
private static byte[] _gfMul03 = private static readonly byte[] _gfMul03 = new byte[]
{ {
0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11, 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11,
0x30, 0x33, 0x36, 0x35, 0x3c, 0x3f, 0x3a, 0x39, 0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21, 0x30, 0x33, 0x36, 0x35, 0x3c, 0x3f, 0x3a, 0x39, 0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21,
@ -89,7 +89,7 @@ namespace ChocolArm64.Instructions
0x0b, 0x08, 0x0d, 0x0e, 0x07, 0x04, 0x01, 0x02, 0x13, 0x10, 0x15, 0x16, 0x1f, 0x1c, 0x19, 0x1a 0x0b, 0x08, 0x0d, 0x0e, 0x07, 0x04, 0x01, 0x02, 0x13, 0x10, 0x15, 0x16, 0x1f, 0x1c, 0x19, 0x1a
}; };
private static byte[] _gfMul09 = private static readonly byte[] _gfMul09 = new byte[]
{ {
0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77, 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77,
0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf, 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7, 0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf, 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7,
@ -109,7 +109,7 @@ namespace ChocolArm64.Instructions
0x31, 0x38, 0x23, 0x2a, 0x15, 0x1c, 0x07, 0x0e, 0x79, 0x70, 0x6b, 0x62, 0x5d, 0x54, 0x4f, 0x46 0x31, 0x38, 0x23, 0x2a, 0x15, 0x1c, 0x07, 0x0e, 0x79, 0x70, 0x6b, 0x62, 0x5d, 0x54, 0x4f, 0x46
}; };
private static byte[] _gfMul0B = private static readonly byte[] _gfMul0B = new byte[]
{ {
0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69, 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69,
0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81, 0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9, 0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81, 0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9,
@ -129,7 +129,7 @@ namespace ChocolArm64.Instructions
0xca, 0xc1, 0xdc, 0xd7, 0xe6, 0xed, 0xf0, 0xfb, 0x92, 0x99, 0x84, 0x8f, 0xbe, 0xb5, 0xa8, 0xa3 0xca, 0xc1, 0xdc, 0xd7, 0xe6, 0xed, 0xf0, 0xfb, 0x92, 0x99, 0x84, 0x8f, 0xbe, 0xb5, 0xa8, 0xa3
}; };
private static byte[] _gfMul0D = private static readonly byte[] _gfMul0D = new byte[]
{ {
0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b, 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b,
0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3, 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b, 0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3, 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b,
@ -149,7 +149,7 @@ namespace ChocolArm64.Instructions
0xdc, 0xd1, 0xc6, 0xcb, 0xe8, 0xe5, 0xf2, 0xff, 0xb4, 0xb9, 0xae, 0xa3, 0x80, 0x8d, 0x9a, 0x97 0xdc, 0xd1, 0xc6, 0xcb, 0xe8, 0xe5, 0xf2, 0xff, 0xb4, 0xb9, 0xae, 0xa3, 0x80, 0x8d, 0x9a, 0x97
}; };
private static byte[] _gfMul0E = private static readonly byte[] _gfMul0E = new byte[]
{ {
0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a, 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a,
0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca, 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba, 0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca, 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba,
@ -169,9 +169,15 @@ namespace ChocolArm64.Instructions
0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0xa7, 0xa9, 0xbb, 0xb5, 0x9f, 0x91, 0x83, 0x8d 0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0xa7, 0xa9, 0xbb, 0xb5, 0x9f, 0x91, 0x83, 0x8d
}; };
private static byte[] _srPerm = { 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3 }; private static readonly byte[] _srPerm = new byte[]
{
0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3
};
private static byte[] _isrPerm = { 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11 }; private static readonly byte[] _isrPerm = new byte[]
{
0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11
};
#endregion #endregion
public static Vector128<float> AesInvMixColumns(Vector128<float> op) public static Vector128<float> AesInvMixColumns(Vector128<float> op)
@ -179,7 +185,7 @@ namespace ChocolArm64.Instructions
byte[] inState = new byte[16]; byte[] inState = new byte[16];
byte[] outState = new byte[16]; byte[] outState = new byte[16];
FromVectorToByteArray(inState, ref op); FromVectorToByteArray(op, inState);
for (int columns = 0; columns <= 3; columns++) for (int columns = 0; columns <= 3; columns++)
{ {
@ -206,7 +212,7 @@ namespace ChocolArm64.Instructions
byte[] inState = new byte[16]; byte[] inState = new byte[16];
byte[] outState = new byte[16]; byte[] outState = new byte[16];
FromVectorToByteArray(inState, ref op); FromVectorToByteArray(op, inState);
for (int idx = 0; idx <= 15; idx++) for (int idx = 0; idx <= 15; idx++)
{ {
@ -223,7 +229,7 @@ namespace ChocolArm64.Instructions
byte[] inState = new byte[16]; byte[] inState = new byte[16];
byte[] outState = new byte[16]; byte[] outState = new byte[16];
FromVectorToByteArray(inState, ref op); FromVectorToByteArray(op, inState);
for (int idx = 0; idx <= 15; idx++) for (int idx = 0; idx <= 15; idx++)
{ {
@ -240,7 +246,7 @@ namespace ChocolArm64.Instructions
byte[] inState = new byte[16]; byte[] inState = new byte[16];
byte[] outState = new byte[16]; byte[] outState = new byte[16];
FromVectorToByteArray(inState, ref op); FromVectorToByteArray(op, inState);
for (int columns = 0; columns <= 3; columns++) for (int columns = 0; columns <= 3; columns++)
{ {
@ -267,7 +273,7 @@ namespace ChocolArm64.Instructions
byte[] inState = new byte[16]; byte[] inState = new byte[16];
byte[] outState = new byte[16]; byte[] outState = new byte[16];
FromVectorToByteArray(inState, ref op); FromVectorToByteArray(op, inState);
for (int idx = 0; idx <= 15; idx++) for (int idx = 0; idx <= 15; idx++)
{ {
@ -284,7 +290,7 @@ namespace ChocolArm64.Instructions
byte[] inState = new byte[16]; byte[] inState = new byte[16];
byte[] outState = new byte[16]; byte[] outState = new byte[16];
FromVectorToByteArray(inState, ref op); FromVectorToByteArray(op, inState);
for (int idx = 0; idx <= 15; idx++) for (int idx = 0; idx <= 15; idx++)
{ {
@ -296,33 +302,30 @@ namespace ChocolArm64.Instructions
return op; return op;
} }
private static void FromVectorToByteArray(byte[] state, ref Vector128<float> op) private unsafe static void FromVectorToByteArray(Vector128<float> op, byte[] state)
{
ulong uLongLow = VectorHelper.VectorExtractIntZx((op), (byte)0, 3);
ulong uLongHigh = VectorHelper.VectorExtractIntZx((op), (byte)1, 3);
for (int idx = 0; idx <= 7; idx++)
{
state[idx + 0] = (byte)(uLongLow & 0xFFUL);
state[idx + 8] = (byte)(uLongHigh & 0xFFUL);
uLongLow >>= 8;
uLongHigh >>= 8;
}
}
private static void FromByteArrayToVector(byte[] state, ref Vector128<float> op)
{ {
if (!Sse2.IsSupported) if (!Sse2.IsSupported)
{ {
throw new PlatformNotSupportedException(); throw new PlatformNotSupportedException();
} }
op = Sse.StaticCast<byte, float>(Sse2.SetVector128( fixed (byte* ptr = &state[0])
state[15], state[14], state[13], state[12], {
state[11], state[10], state[9], state[8], Sse2.Store(ptr, Sse.StaticCast<float, byte>(op));
state[7], state[6], state[5], state[4], }
state[3], state[2], state[1], state[0])); }
private unsafe static void FromByteArrayToVector(byte[] state, ref Vector128<float> op)
{
if (!Sse2.IsSupported)
{
throw new PlatformNotSupportedException();
}
fixed (byte* ptr = &state[0])
{
op = Sse.StaticCast<byte, float>(Sse2.LoadVector128(ptr));
}
} }
} }
} }

View file

@ -392,8 +392,7 @@ namespace ChocolArm64.Instructions
public static void Fadd_S(ILEmitterCtx context) public static void Fadd_S(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitScalarSseOrSse2OpF(context, nameof(Sse.AddScalar)); EmitScalarSseOrSse2OpF(context, nameof(Sse.AddScalar));
} }
@ -408,8 +407,7 @@ namespace ChocolArm64.Instructions
public static void Fadd_V(ILEmitterCtx context) public static void Fadd_V(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitVectorSseOrSse2OpF(context, nameof(Sse.Add)); EmitVectorSseOrSse2OpF(context, nameof(Sse.Add));
} }
@ -470,8 +468,7 @@ namespace ChocolArm64.Instructions
public static void Faddp_V(ILEmitterCtx context) public static void Faddp_V(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Add)); EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Add));
} }
@ -486,8 +483,7 @@ namespace ChocolArm64.Instructions
public static void Fdiv_S(ILEmitterCtx context) public static void Fdiv_S(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitScalarSseOrSse2OpF(context, nameof(Sse.DivideScalar)); EmitScalarSseOrSse2OpF(context, nameof(Sse.DivideScalar));
} }
@ -502,8 +498,7 @@ namespace ChocolArm64.Instructions
public static void Fdiv_V(ILEmitterCtx context) public static void Fdiv_V(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitVectorSseOrSse2OpF(context, nameof(Sse.Divide)); EmitVectorSseOrSse2OpF(context, nameof(Sse.Divide));
} }
@ -564,8 +559,7 @@ namespace ChocolArm64.Instructions
public static void Fmax_S(ILEmitterCtx context) public static void Fmax_S(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitScalarSseOrSse2OpF(context, nameof(Sse.MaxScalar)); EmitScalarSseOrSse2OpF(context, nameof(Sse.MaxScalar));
} }
@ -580,8 +574,7 @@ namespace ChocolArm64.Instructions
public static void Fmax_V(ILEmitterCtx context) public static void Fmax_V(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitVectorSseOrSse2OpF(context, nameof(Sse.Max)); EmitVectorSseOrSse2OpF(context, nameof(Sse.Max));
} }
@ -612,8 +605,7 @@ namespace ChocolArm64.Instructions
public static void Fmaxp_V(ILEmitterCtx context) public static void Fmaxp_V(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Max)); EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Max));
} }
@ -628,8 +620,7 @@ namespace ChocolArm64.Instructions
public static void Fmin_S(ILEmitterCtx context) public static void Fmin_S(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitScalarSseOrSse2OpF(context, nameof(Sse.MinScalar)); EmitScalarSseOrSse2OpF(context, nameof(Sse.MinScalar));
} }
@ -644,8 +635,7 @@ namespace ChocolArm64.Instructions
public static void Fmin_V(ILEmitterCtx context) public static void Fmin_V(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitVectorSseOrSse2OpF(context, nameof(Sse.Min)); EmitVectorSseOrSse2OpF(context, nameof(Sse.Min));
} }
@ -676,8 +666,7 @@ namespace ChocolArm64.Instructions
public static void Fminp_V(ILEmitterCtx context) public static void Fminp_V(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Min)); EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Min));
} }
@ -984,8 +973,7 @@ namespace ChocolArm64.Instructions
public static void Fmul_S(ILEmitterCtx context) public static void Fmul_S(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitScalarSseOrSse2OpF(context, nameof(Sse.MultiplyScalar)); EmitScalarSseOrSse2OpF(context, nameof(Sse.MultiplyScalar));
} }
@ -1005,8 +993,7 @@ namespace ChocolArm64.Instructions
public static void Fmul_V(ILEmitterCtx context) public static void Fmul_V(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitVectorSseOrSse2OpF(context, nameof(Sse.Multiply)); EmitVectorSseOrSse2OpF(context, nameof(Sse.Multiply));
} }
@ -1753,8 +1740,7 @@ namespace ChocolArm64.Instructions
public static void Fsqrt_S(ILEmitterCtx context) public static void Fsqrt_S(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitScalarSseOrSse2OpF(context, nameof(Sse.SqrtScalar)); EmitScalarSseOrSse2OpF(context, nameof(Sse.SqrtScalar));
} }
@ -1769,8 +1755,7 @@ namespace ChocolArm64.Instructions
public static void Fsqrt_V(ILEmitterCtx context) public static void Fsqrt_V(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitVectorSseOrSse2OpF(context, nameof(Sse.Sqrt)); EmitVectorSseOrSse2OpF(context, nameof(Sse.Sqrt));
} }
@ -1785,8 +1770,7 @@ namespace ChocolArm64.Instructions
public static void Fsub_S(ILEmitterCtx context) public static void Fsub_S(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitScalarSseOrSse2OpF(context, nameof(Sse.SubtractScalar)); EmitScalarSseOrSse2OpF(context, nameof(Sse.SubtractScalar));
} }
@ -1801,8 +1785,7 @@ namespace ChocolArm64.Instructions
public static void Fsub_V(ILEmitterCtx context) public static void Fsub_V(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitVectorSseOrSse2OpF(context, nameof(Sse.Subtract)); EmitVectorSseOrSse2OpF(context, nameof(Sse.Subtract));
} }
@ -2268,6 +2251,15 @@ namespace ChocolArm64.Instructions
} }
} }
public static void Smlal_Ve(ILEmitterCtx context)
{
EmitVectorWidenTernaryOpByElemSx(context, () =>
{
context.Emit(OpCodes.Mul);
context.Emit(OpCodes.Add);
});
}
public static void Smlsl_V(ILEmitterCtx context) public static void Smlsl_V(ILEmitterCtx context)
{ {
OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
@ -2319,11 +2311,25 @@ namespace ChocolArm64.Instructions
} }
} }
public static void Smlsl_Ve(ILEmitterCtx context)
{
EmitVectorWidenTernaryOpByElemSx(context, () =>
{
context.Emit(OpCodes.Mul);
context.Emit(OpCodes.Sub);
});
}
public static void Smull_V(ILEmitterCtx context) public static void Smull_V(ILEmitterCtx context)
{ {
EmitVectorWidenRnRmBinaryOpSx(context, () => context.Emit(OpCodes.Mul)); EmitVectorWidenRnRmBinaryOpSx(context, () => context.Emit(OpCodes.Mul));
} }
public static void Smull_Ve(ILEmitterCtx context)
{
EmitVectorWidenBinaryOpByElemSx(context, () => context.Emit(OpCodes.Mul));
}
public static void Sqabs_S(ILEmitterCtx context) public static void Sqabs_S(ILEmitterCtx context)
{ {
EmitScalarSaturatingUnaryOpSx(context, () => EmitAbs(context)); EmitScalarSaturatingUnaryOpSx(context, () => EmitAbs(context));
@ -2929,6 +2935,15 @@ namespace ChocolArm64.Instructions
} }
} }
public static void Umlal_Ve(ILEmitterCtx context)
{
EmitVectorWidenTernaryOpByElemZx(context, () =>
{
context.Emit(OpCodes.Mul);
context.Emit(OpCodes.Add);
});
}
public static void Umlsl_V(ILEmitterCtx context) public static void Umlsl_V(ILEmitterCtx context)
{ {
OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
@ -2980,11 +2995,25 @@ namespace ChocolArm64.Instructions
} }
} }
public static void Umlsl_Ve(ILEmitterCtx context)
{
EmitVectorWidenTernaryOpByElemZx(context, () =>
{
context.Emit(OpCodes.Mul);
context.Emit(OpCodes.Sub);
});
}
public static void Umull_V(ILEmitterCtx context) public static void Umull_V(ILEmitterCtx context)
{ {
EmitVectorWidenRnRmBinaryOpZx(context, () => context.Emit(OpCodes.Mul)); EmitVectorWidenRnRmBinaryOpZx(context, () => context.Emit(OpCodes.Mul));
} }
public static void Umull_Ve(ILEmitterCtx context)
{
EmitVectorWidenBinaryOpByElemZx(context, () => context.Emit(OpCodes.Mul));
}
public static void Uqadd_S(ILEmitterCtx context) public static void Uqadd_S(ILEmitterCtx context)
{ {
EmitScalarSaturatingBinaryOpZx(context, SaturatingFlags.Add); EmitScalarSaturatingBinaryOpZx(context, SaturatingFlags.Add);

View file

@ -173,8 +173,7 @@ namespace ChocolArm64.Instructions
public static void Fcmeq_S(ILEmitterCtx context) public static void Fcmeq_S(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareEqualScalar), scalar: true); EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareEqualScalar), scalar: true);
} }
@ -186,8 +185,7 @@ namespace ChocolArm64.Instructions
public static void Fcmeq_V(ILEmitterCtx context) public static void Fcmeq_V(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareEqual), scalar: false); EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareEqual), scalar: false);
} }
@ -199,8 +197,7 @@ namespace ChocolArm64.Instructions
public static void Fcmge_S(ILEmitterCtx context) public static void Fcmge_S(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqualScalar), scalar: true); EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqualScalar), scalar: true);
} }
@ -212,8 +209,7 @@ namespace ChocolArm64.Instructions
public static void Fcmge_V(ILEmitterCtx context) public static void Fcmge_V(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqual), scalar: false); EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqual), scalar: false);
} }
@ -225,8 +221,7 @@ namespace ChocolArm64.Instructions
public static void Fcmgt_S(ILEmitterCtx context) public static void Fcmgt_S(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanScalar), scalar: true); EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanScalar), scalar: true);
} }
@ -238,8 +233,7 @@ namespace ChocolArm64.Instructions
public static void Fcmgt_V(ILEmitterCtx context) public static void Fcmgt_V(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThan), scalar: false); EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThan), scalar: false);
} }
@ -251,8 +245,7 @@ namespace ChocolArm64.Instructions
public static void Fcmle_S(ILEmitterCtx context) public static void Fcmle_S(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqualScalar), scalar: true, isLeOrLt: true); EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqualScalar), scalar: true, isLeOrLt: true);
} }
@ -264,8 +257,7 @@ namespace ChocolArm64.Instructions
public static void Fcmle_V(ILEmitterCtx context) public static void Fcmle_V(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqual), scalar: false, isLeOrLt: true); EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqual), scalar: false, isLeOrLt: true);
} }
@ -277,8 +269,7 @@ namespace ChocolArm64.Instructions
public static void Fcmlt_S(ILEmitterCtx context) public static void Fcmlt_S(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanScalar), scalar: true, isLeOrLt: true); EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanScalar), scalar: true, isLeOrLt: true);
} }
@ -290,8 +281,7 @@ namespace ChocolArm64.Instructions
public static void Fcmlt_V(ILEmitterCtx context) public static void Fcmlt_V(ILEmitterCtx context)
{ {
if (Optimizations.FastFP && Optimizations.UseSse if (Optimizations.FastFP && Optimizations.UseSse2)
&& Optimizations.UseSse2)
{ {
EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThan), scalar: false, isLeOrLt: true); EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThan), scalar: false, isLeOrLt: true);
} }

View file

@ -78,7 +78,6 @@ namespace ChocolArm64.Instructions
if (Optimizations.UseSse2 && sizeF == 1) if (Optimizations.UseSse2 && sizeF == 1)
{ {
Type[] typesMov = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
Type[] typesCvt = new Type[] { typeof(Vector128<float>) }; Type[] typesCvt = new Type[] { typeof(Vector128<float>) };
string nameMov = op.RegisterSize == RegisterSize.Simd128 string nameMov = op.RegisterSize == RegisterSize.Simd128
@ -88,7 +87,7 @@ namespace ChocolArm64.Instructions
context.EmitLdvec(op.Rn); context.EmitLdvec(op.Rn);
context.Emit(OpCodes.Dup); context.Emit(OpCodes.Dup);
context.EmitCall(typeof(Sse).GetMethod(nameMov, typesMov)); context.EmitCall(typeof(Sse).GetMethod(nameMov));
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Double), typesCvt)); context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Double), typesCvt));
@ -144,7 +143,6 @@ namespace ChocolArm64.Instructions
if (Optimizations.UseSse2 && sizeF == 1) if (Optimizations.UseSse2 && sizeF == 1)
{ {
Type[] typesMov = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
Type[] typesCvt = new Type[] { typeof(Vector128<double>) }; Type[] typesCvt = new Type[] { typeof(Vector128<double>) };
string nameMov = op.RegisterSize == RegisterSize.Simd128 string nameMov = op.RegisterSize == RegisterSize.Simd128
@ -154,15 +152,15 @@ namespace ChocolArm64.Instructions
context.EmitLdvec(op.Rd); context.EmitLdvec(op.Rd);
VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero)); VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), typesMov)); context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh)));
EmitLdvecWithCastToDouble(context, op.Rn); EmitLdvecWithCastToDouble(context, op.Rn);
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Single), typesCvt)); context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Single), typesCvt));
context.Emit(OpCodes.Dup); context.Emit(OpCodes.Dup);
context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), typesMov)); context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh)));
context.EmitCall(typeof(Sse).GetMethod(nameMov, typesMov)); context.EmitCall(typeof(Sse).GetMethod(nameMov));
context.EmitStvec(op.Rd); context.EmitStvec(op.Rd);
} }

View file

@ -642,21 +642,21 @@ namespace ChocolArm64.Instructions
{ {
OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp; OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
EmitVectorOpByElem(context, emit, op.Index, false, true); EmitVectorOpByElem(context, emit, op.Index, ternary: false, signed: true);
} }
public static void EmitVectorBinaryOpByElemZx(ILEmitterCtx context, Action emit) public static void EmitVectorBinaryOpByElemZx(ILEmitterCtx context, Action emit)
{ {
OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp; OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
EmitVectorOpByElem(context, emit, op.Index, false, false); EmitVectorOpByElem(context, emit, op.Index, ternary: false, signed: false);
} }
public static void EmitVectorTernaryOpByElemZx(ILEmitterCtx context, Action emit) public static void EmitVectorTernaryOpByElemZx(ILEmitterCtx context, Action emit)
{ {
OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp; OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
EmitVectorOpByElem(context, emit, op.Index, true, false); EmitVectorOpByElem(context, emit, op.Index, ternary: true, signed: false);
} }
public static void EmitVectorOpByElem(ILEmitterCtx context, Action emit, int elem, bool ternary, bool signed) public static void EmitVectorOpByElem(ILEmitterCtx context, Action emit, int elem, bool ternary, bool signed)
@ -809,6 +809,64 @@ namespace ChocolArm64.Instructions
context.EmitStvec(op.Rd); context.EmitStvec(op.Rd);
} }
public static void EmitVectorWidenBinaryOpByElemSx(ILEmitterCtx context, Action emit)
{
OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
EmitVectorWidenOpByElem(context, emit, op.Index, ternary: false, signed: true);
}
public static void EmitVectorWidenBinaryOpByElemZx(ILEmitterCtx context, Action emit)
{
OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
EmitVectorWidenOpByElem(context, emit, op.Index, ternary: false, signed: false);
}
public static void EmitVectorWidenTernaryOpByElemSx(ILEmitterCtx context, Action emit)
{
OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
EmitVectorWidenOpByElem(context, emit, op.Index, ternary: true, signed: true);
}
public static void EmitVectorWidenTernaryOpByElemZx(ILEmitterCtx context, Action emit)
{
OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
EmitVectorWidenOpByElem(context, emit, op.Index, ternary: true, signed: false);
}
public static void EmitVectorWidenOpByElem(ILEmitterCtx context, Action emit, int elem, bool ternary, bool signed)
{
OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
int elems = 8 >> op.Size;
int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
EmitVectorExtract(context, op.Rm, elem, op.Size, signed);
context.EmitSttmp();
for (int index = 0; index < elems; index++)
{
if (ternary)
{
EmitVectorExtract(context, op.Rd, index, op.Size + 1, signed);
}
EmitVectorExtract(context, op.Rn, part + index, op.Size, signed);
context.EmitLdtmp();
emit();
EmitVectorInsertTmp(context, index, op.Size + 1);
}
context.EmitLdvectmp();
context.EmitStvec(op.Rd);
}
public static void EmitVectorPairwiseOpSx(ILEmitterCtx context, Action emit) public static void EmitVectorPairwiseOpSx(ILEmitterCtx context, Action emit)
{ {
EmitVectorPairwiseOp(context, emit, true); EmitVectorPairwiseOp(context, emit, true);
@ -1416,7 +1474,7 @@ namespace ChocolArm64.Instructions
if (Optimizations.UseSse) if (Optimizations.UseSse)
{ {
//TODO: Use Sse2.MoveScalar once it is fixed, //TODO: Use Sse2.MoveScalar once it is fixed,
//as of the time of writing it just crashes the JIT (SDK 2.1.500). //as of the time of writing it just crashes the JIT (SDK 2.1.503).
/*Type[] typesMov = new Type[] { typeof(Vector128<ulong>) }; /*Type[] typesMov = new Type[] { typeof(Vector128<ulong>) };

View file

@ -12,6 +12,34 @@ namespace ChocolArm64.Instructions
{ {
static partial class InstEmit static partial class InstEmit
{ {
#region "Masks"
private static readonly long[] _masksE0_TrnUzpXtn = new long[]
{
14L << 56 | 12L << 48 | 10L << 40 | 08L << 32 | 06L << 24 | 04L << 16 | 02L << 8 | 00L << 0,
13L << 56 | 12L << 48 | 09L << 40 | 08L << 32 | 05L << 24 | 04L << 16 | 01L << 8 | 00L << 0,
11L << 56 | 10L << 48 | 09L << 40 | 08L << 32 | 03L << 24 | 02L << 16 | 01L << 8 | 00L << 0
};
private static readonly long[] _masksE1_TrnUzp = new long[]
{
15L << 56 | 13L << 48 | 11L << 40 | 09L << 32 | 07L << 24 | 05L << 16 | 03L << 8 | 01L << 0,
15L << 56 | 14L << 48 | 11L << 40 | 10L << 32 | 07L << 24 | 06L << 16 | 03L << 8 | 02L << 0,
15L << 56 | 14L << 48 | 13L << 40 | 12L << 32 | 07L << 24 | 06L << 16 | 05L << 8 | 04L << 0
};
private static readonly long[] _masksE0_Uzp = new long[]
{
13L << 56 | 09L << 48 | 05L << 40 | 01L << 32 | 12L << 24 | 08L << 16 | 04L << 8 | 00L << 0,
11L << 56 | 10L << 48 | 03L << 40 | 02L << 32 | 09L << 24 | 08L << 16 | 01L << 8 | 00L << 0
};
private static readonly long[] _masksE1_Uzp = new long[]
{
15L << 56 | 11L << 48 | 07L << 40 | 03L << 32 | 14L << 24 | 10L << 16 | 06L << 8 | 02L << 0,
15L << 56 | 14L << 48 | 07L << 40 | 06L << 32 | 13L << 24 | 12L << 16 | 05L << 8 | 04L << 0
};
#endregion
public static void Dup_Gp(ILEmitterCtx context) public static void Dup_Gp(ILEmitterCtx context)
{ {
OpCodeSimdIns64 op = (OpCodeSimdIns64)context.CurrOp; OpCodeSimdIns64 op = (OpCodeSimdIns64)context.CurrOp;
@ -379,15 +407,6 @@ namespace ChocolArm64.Instructions
if (Optimizations.UseSsse3) if (Optimizations.UseSsse3)
{ {
long[] masks = new long[]
{
14L << 56 | 12L << 48 | 10L << 40 | 08L << 32 | 06L << 24 | 04L << 16 | 02L << 8 | 00L << 0,
13L << 56 | 12L << 48 | 09L << 40 | 08L << 32 | 05L << 24 | 04L << 16 | 01L << 8 | 00L << 0,
11L << 56 | 10L << 48 | 09L << 40 | 08L << 32 | 03L << 24 | 02L << 16 | 01L << 8 | 00L << 0
};
Type[] typesMov = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
Type[] typesSfl = new Type[] { typeof(Vector128<sbyte>), typeof(Vector128<sbyte>) };
Type[] typesSve = new Type[] { typeof(long), typeof(long) }; Type[] typesSve = new Type[] { typeof(long), typeof(long) };
string nameMov = op.RegisterSize == RegisterSize.Simd128 string nameMov = op.RegisterSize == RegisterSize.Simd128
@ -397,18 +416,18 @@ namespace ChocolArm64.Instructions
context.EmitLdvec(op.Rd); context.EmitLdvec(op.Rd);
VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero)); VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), typesMov)); context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh)));
EmitLdvecWithSignedCast(context, op.Rn, 0); EmitLdvecWithSignedCast(context, op.Rn, 0); // value
context.EmitLdc_I8(masks[op.Size]); context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // mask
context.Emit(OpCodes.Dup); context.Emit(OpCodes.Dup); // mask
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve)); context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), typesSfl)); context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
context.EmitCall(typeof(Sse).GetMethod(nameMov, typesMov)); context.EmitCall(typeof(Sse).GetMethod(nameMov));
context.EmitStvec(op.Rd); context.EmitStvec(op.Rd);
} }
@ -465,22 +484,61 @@ namespace ChocolArm64.Instructions
{ {
OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
int words = op.GetBitsCount() >> 4; if (Optimizations.UseSsse3)
int pairs = words >> op.Size;
for (int index = 0; index < pairs; index++)
{ {
int idx = index << 1; Type[] typesSve = new Type[] { typeof(long), typeof(long) };
EmitVectorExtractZx(context, op.Rn, idx + part, op.Size); string nameUpk = part == 0
EmitVectorExtractZx(context, op.Rm, idx + part, op.Size); ? nameof(Sse2.UnpackLow)
: nameof(Sse2.UnpackHigh);
EmitVectorInsertTmp(context, idx + 1, op.Size); EmitLdvecWithSignedCast(context, op.Rn, op.Size); // value
EmitVectorInsertTmp(context, idx, op.Size);
if (op.Size < 3)
{
context.EmitLdc_I8(_masksE1_TrnUzp [op.Size]); // maskE1
context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // maskE0
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
}
EmitLdvecWithSignedCast(context, op.Rm, op.Size); // value
if (op.Size < 3)
{
context.EmitLdc_I8(_masksE1_TrnUzp [op.Size]); // maskE1
context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // maskE0
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
}
context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(op.Size)));
EmitStvecWithSignedCast(context, op.Rd, op.Size);
} }
else
{
int words = op.GetBitsCount() >> 4;
int pairs = words >> op.Size;
context.EmitLdvectmp(); for (int index = 0; index < pairs; index++)
context.EmitStvec(op.Rd); {
int idx = index << 1;
EmitVectorExtractZx(context, op.Rn, idx + part, op.Size);
EmitVectorExtractZx(context, op.Rm, idx + part, op.Size);
EmitVectorInsertTmp(context, idx + 1, op.Size);
EmitVectorInsertTmp(context, idx, op.Size);
}
context.EmitLdvectmp();
context.EmitStvec(op.Rd);
}
if (op.RegisterSize == RegisterSize.Simd64) if (op.RegisterSize == RegisterSize.Simd64)
{ {
@ -492,26 +550,91 @@ namespace ChocolArm64.Instructions
{ {
OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
int words = op.GetBitsCount() >> 4; if (Optimizations.UseSsse3)
int pairs = words >> op.Size;
for (int index = 0; index < pairs; index++)
{ {
int idx = index << 1; Type[] typesSve = new Type[] { typeof(long), typeof(long) };
EmitVectorExtractZx(context, op.Rn, idx + part, op.Size); string nameUpk = part == 0
EmitVectorExtractZx(context, op.Rm, idx + part, op.Size); ? nameof(Sse2.UnpackLow)
: nameof(Sse2.UnpackHigh);
EmitVectorInsertTmp(context, pairs + index, op.Size); if (op.RegisterSize == RegisterSize.Simd128)
EmitVectorInsertTmp(context, index, op.Size); {
EmitLdvecWithSignedCast(context, op.Rn, op.Size); // value
if (op.Size < 3)
{
context.EmitLdc_I8(_masksE1_TrnUzp [op.Size]); // maskE1
context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // maskE0
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
}
EmitLdvecWithSignedCast(context, op.Rm, op.Size); // value
if (op.Size < 3)
{
context.EmitLdc_I8(_masksE1_TrnUzp [op.Size]); // maskE1
context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // maskE0
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
}
context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(3)));
EmitStvecWithSignedCast(context, op.Rd, op.Size);
}
else
{
EmitLdvecWithSignedCast(context, op.Rn, op.Size);
EmitLdvecWithSignedCast(context, op.Rm, op.Size);
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.UnpackLow), GetTypesSflUpk(op.Size))); // value
if (op.Size < 2)
{
context.EmitLdc_I8(_masksE1_Uzp[op.Size]); // maskE1
context.EmitLdc_I8(_masksE0_Uzp[op.Size]); // maskE0
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
}
VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInt64Zero));
context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(3)));
EmitStvecWithSignedCast(context, op.Rd, op.Size);
}
} }
else
context.EmitLdvectmp();
context.EmitStvec(op.Rd);
if (op.RegisterSize == RegisterSize.Simd64)
{ {
EmitVectorZeroUpper(context, op.Rd); int words = op.GetBitsCount() >> 4;
int pairs = words >> op.Size;
for (int index = 0; index < pairs; index++)
{
int idx = index << 1;
EmitVectorExtractZx(context, op.Rn, idx + part, op.Size);
EmitVectorExtractZx(context, op.Rm, idx + part, op.Size);
EmitVectorInsertTmp(context, pairs + index, op.Size);
EmitVectorInsertTmp(context, index, op.Size);
}
context.EmitLdvectmp();
context.EmitStvec(op.Rd);
if (op.RegisterSize == RegisterSize.Simd64)
{
EmitVectorZeroUpper(context, op.Rd);
}
} }
} }
@ -521,36 +644,26 @@ namespace ChocolArm64.Instructions
if (Optimizations.UseSse2) if (Optimizations.UseSse2)
{ {
EmitLdvecWithUnsignedCast(context, op.Rn, op.Size); string nameUpk = part == 0
EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
Type[] types = new Type[]
{
VectorUIntTypesPerSizeLog2[op.Size],
VectorUIntTypesPerSizeLog2[op.Size]
};
string name = part == 0 || (part != 0 && op.RegisterSize == RegisterSize.Simd64)
? nameof(Sse2.UnpackLow) ? nameof(Sse2.UnpackLow)
: nameof(Sse2.UnpackHigh); : nameof(Sse2.UnpackHigh);
context.EmitCall(typeof(Sse2).GetMethod(name, types)); EmitLdvecWithSignedCast(context, op.Rn, op.Size);
EmitLdvecWithSignedCast(context, op.Rm, op.Size);
if (op.RegisterSize == RegisterSize.Simd64 && part != 0) if (op.RegisterSize == RegisterSize.Simd128)
{ {
context.EmitLdc_I4(8); context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(op.Size)));
}
else
{
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.UnpackLow), GetTypesSflUpk(op.Size)));
VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInt64Zero));
Type[] shTypes = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) }; context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(3)));
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), shTypes));
} }
EmitStvecWithUnsignedCast(context, op.Rd, op.Size); EmitStvecWithSignedCast(context, op.Rd, op.Size);
if (op.RegisterSize == RegisterSize.Simd64 && part == 0)
{
EmitVectorZeroUpper(context, op.Rd);
}
} }
else else
{ {
@ -579,5 +692,10 @@ namespace ChocolArm64.Instructions
} }
} }
} }
private static Type[] GetTypesSflUpk(int size)
{
return new Type[] { VectorIntTypesPerSizeLog2[size], VectorIntTypesPerSizeLog2[size] };
}
} }
} }

View file

@ -664,7 +664,7 @@ namespace ChocolArm64.Instructions
for (int bit = highBit; bit >= 0; bit--) for (int bit = highBit; bit >= 0; bit--)
{ {
if (((value >> bit) & 0b1) != 0) if (((int)(value >> bit) & 0b1) != 0)
{ {
return (ulong)(highBit - bit); return (ulong)(highBit - bit);
} }
@ -688,7 +688,7 @@ namespace ChocolArm64.Instructions
do do
{ {
nibbleIdx -= 4; nibbleIdx -= 4;
preCount = ClzNibbleTbl[(value >> nibbleIdx) & 0b1111]; preCount = ClzNibbleTbl[(int)(value >> nibbleIdx) & 0b1111];
count += preCount; count += preCount;
} }
while (preCount == 4); while (preCount == 4);
@ -698,11 +698,6 @@ namespace ChocolArm64.Instructions
public static ulong CountSetBits8(ulong value) // "size" is 8 (SIMD&FP Inst.). public static ulong CountSetBits8(ulong value) // "size" is 8 (SIMD&FP Inst.).
{ {
if (value == 0xfful)
{
return 8ul;
}
value = ((value >> 1) & 0x55ul) + (value & 0x55ul); value = ((value >> 1) & 0x55ul) + (value & 0x55ul);
value = ((value >> 2) & 0x33ul) + (value & 0x33ul); value = ((value >> 2) & 0x33ul) + (value & 0x33ul);

View file

@ -1545,9 +1545,9 @@ namespace ChocolArm64.Instructions
return -value; return -value;
} }
private static float ZerosOrOnes(bool zeros) private static float ZerosOrOnes(bool ones)
{ {
return BitConverter.Int32BitsToSingle(!zeros ? 0 : -1); return BitConverter.Int32BitsToSingle(ones ? -1 : 0);
} }
private static float FPUnpack( private static float FPUnpack(
@ -2629,9 +2629,9 @@ namespace ChocolArm64.Instructions
return -value; return -value;
} }
private static double ZerosOrOnes(bool zeros) private static double ZerosOrOnes(bool ones)
{ {
return BitConverter.Int64BitsToDouble(!zeros ? 0L : -1L); return BitConverter.Int64BitsToDouble(ones ? -1L : 0L);
} }
private static double FPUnpack( private static double FPUnpack(

View file

@ -445,9 +445,12 @@ namespace ChocolArm64
SetA64("0x001110<<1xxxxx011011xxxxxxxxxx", InstEmit.Smin_V, typeof(OpCodeSimdReg64)); SetA64("0x001110<<1xxxxx011011xxxxxxxxxx", InstEmit.Smin_V, typeof(OpCodeSimdReg64));
SetA64("0x001110<<1xxxxx101011xxxxxxxxxx", InstEmit.Sminp_V, typeof(OpCodeSimdReg64)); SetA64("0x001110<<1xxxxx101011xxxxxxxxxx", InstEmit.Sminp_V, typeof(OpCodeSimdReg64));
SetA64("0x001110<<1xxxxx100000xxxxxxxxxx", InstEmit.Smlal_V, typeof(OpCodeSimdReg64)); SetA64("0x001110<<1xxxxx100000xxxxxxxxxx", InstEmit.Smlal_V, typeof(OpCodeSimdReg64));
SetA64("0x001111xxxxxxxx0010x0xxxxxxxxxx", InstEmit.Smlal_Ve, typeof(OpCodeSimdRegElem64));
SetA64("0x001110<<1xxxxx101000xxxxxxxxxx", InstEmit.Smlsl_V, typeof(OpCodeSimdReg64)); SetA64("0x001110<<1xxxxx101000xxxxxxxxxx", InstEmit.Smlsl_V, typeof(OpCodeSimdReg64));
SetA64("0x001111xxxxxxxx0110x0xxxxxxxxxx", InstEmit.Smlsl_Ve, typeof(OpCodeSimdRegElem64));
SetA64("0x001110000xxxxx001011xxxxxxxxxx", InstEmit.Smov_S, typeof(OpCodeSimdIns64)); SetA64("0x001110000xxxxx001011xxxxxxxxxx", InstEmit.Smov_S, typeof(OpCodeSimdIns64));
SetA64("0x001110<<1xxxxx110000xxxxxxxxxx", InstEmit.Smull_V, typeof(OpCodeSimdReg64)); SetA64("0x001110<<1xxxxx110000xxxxxxxxxx", InstEmit.Smull_V, typeof(OpCodeSimdReg64));
SetA64("0x001111xxxxxxxx1010x0xxxxxxxxxx", InstEmit.Smull_Ve, typeof(OpCodeSimdRegElem64));
SetA64("01011110xx100000011110xxxxxxxxxx", InstEmit.Sqabs_S, typeof(OpCodeSimd64)); SetA64("01011110xx100000011110xxxxxxxxxx", InstEmit.Sqabs_S, typeof(OpCodeSimd64));
SetA64("0>001110<<100000011110xxxxxxxxxx", InstEmit.Sqabs_V, typeof(OpCodeSimd64)); SetA64("0>001110<<100000011110xxxxxxxxxx", InstEmit.Sqabs_V, typeof(OpCodeSimd64));
SetA64("01011110xx1xxxxx000011xxxxxxxxxx", InstEmit.Sqadd_S, typeof(OpCodeSimdReg64)); SetA64("01011110xx1xxxxx000011xxxxxxxxxx", InstEmit.Sqadd_S, typeof(OpCodeSimdReg64));
@ -534,9 +537,12 @@ namespace ChocolArm64
SetA64("0x101110<<1xxxxx011011xxxxxxxxxx", InstEmit.Umin_V, typeof(OpCodeSimdReg64)); SetA64("0x101110<<1xxxxx011011xxxxxxxxxx", InstEmit.Umin_V, typeof(OpCodeSimdReg64));
SetA64("0x101110<<1xxxxx101011xxxxxxxxxx", InstEmit.Uminp_V, typeof(OpCodeSimdReg64)); SetA64("0x101110<<1xxxxx101011xxxxxxxxxx", InstEmit.Uminp_V, typeof(OpCodeSimdReg64));
SetA64("0x101110<<1xxxxx100000xxxxxxxxxx", InstEmit.Umlal_V, typeof(OpCodeSimdReg64)); SetA64("0x101110<<1xxxxx100000xxxxxxxxxx", InstEmit.Umlal_V, typeof(OpCodeSimdReg64));
SetA64("0x101111xxxxxxxx0010x0xxxxxxxxxx", InstEmit.Umlal_Ve, typeof(OpCodeSimdRegElem64));
SetA64("0x101110<<1xxxxx101000xxxxxxxxxx", InstEmit.Umlsl_V, typeof(OpCodeSimdReg64)); SetA64("0x101110<<1xxxxx101000xxxxxxxxxx", InstEmit.Umlsl_V, typeof(OpCodeSimdReg64));
SetA64("0x101111xxxxxxxx0110x0xxxxxxxxxx", InstEmit.Umlsl_Ve, typeof(OpCodeSimdRegElem64));
SetA64("0x001110000xxxxx001111xxxxxxxxxx", InstEmit.Umov_S, typeof(OpCodeSimdIns64)); SetA64("0x001110000xxxxx001111xxxxxxxxxx", InstEmit.Umov_S, typeof(OpCodeSimdIns64));
SetA64("0x101110<<1xxxxx110000xxxxxxxxxx", InstEmit.Umull_V, typeof(OpCodeSimdReg64)); SetA64("0x101110<<1xxxxx110000xxxxxxxxxx", InstEmit.Umull_V, typeof(OpCodeSimdReg64));
SetA64("0x101111xxxxxxxx1010x0xxxxxxxxxx", InstEmit.Umull_Ve, typeof(OpCodeSimdRegElem64));
SetA64("01111110xx1xxxxx000011xxxxxxxxxx", InstEmit.Uqadd_S, typeof(OpCodeSimdReg64)); SetA64("01111110xx1xxxxx000011xxxxxxxxxx", InstEmit.Uqadd_S, typeof(OpCodeSimdReg64));
SetA64("0>101110<<1xxxxx000011xxxxxxxxxx", InstEmit.Uqadd_V, typeof(OpCodeSimdReg64)); SetA64("0>101110<<1xxxxx000011xxxxxxxxxx", InstEmit.Uqadd_V, typeof(OpCodeSimdReg64));
SetA64("0>101110<<1xxxxx010111xxxxxxxxxx", InstEmit.Uqrshl_V, typeof(OpCodeSimdReg64)); SetA64("0>101110<<1xxxxx010111xxxxxxxxxx", InstEmit.Uqrshl_V, typeof(OpCodeSimdReg64));

View file

@ -45,6 +45,32 @@ namespace Ryujinx.Tests.Cpu
0x0F808000u // MUL V0.2S, V0.2S, V0.S[0] 0x0F808000u // MUL V0.2S, V0.2S, V0.S[0]
}; };
} }
private static uint[] _SU_Mlal_Mlsl_Mull_Ve_4H4S_8H4S_()
{
return new uint[]
{
0x0F402000u, // SMLAL V0.4S, V0.4H, V0.H[0]
0x0F406000u, // SMLSL V0.4S, V0.4H, V0.H[0]
0x0F40A000u, // SMULL V0.4S, V0.4H, V0.H[0]
0x2F402000u, // UMLAL V0.4S, V0.4H, V0.H[0]
0x2F406000u, // UMLSL V0.4S, V0.4H, V0.H[0]
0x2F40A000u // UMULL V0.4S, V0.4H, V0.H[0]
};
}
private static uint[] _SU_Mlal_Mlsl_Mull_Ve_2S2D_4S2D_()
{
return new uint[]
{
0x0F802000u, // SMLAL V0.2D, V0.2S, V0.S[0]
0x0F806000u, // SMLSL V0.2D, V0.2S, V0.S[0]
0x0F80A000u, // SMULL V0.2D, V0.2S, V0.S[0]
0x2F802000u, // UMLAL V0.2D, V0.2S, V0.S[0]
0x2F806000u, // UMLSL V0.2D, V0.2S, V0.S[0]
0x2F80A000u // UMULL V0.2D, V0.2S, V0.S[0]
};
}
#endregion #endregion
private const int RndCnt = 2; private const int RndCnt = 2;
@ -103,6 +129,61 @@ namespace Ryujinx.Tests.Cpu
CompareAgainstUnicorn(); CompareAgainstUnicorn();
} }
[Test, Pairwise]
public void SU_Mlal_Mlsl_Mull_Ve_4H4S_8H4S([ValueSource("_SU_Mlal_Mlsl_Mull_Ve_4H4S_8H4S_")] uint opcodes,
[Values(0u)] uint rd,
[Values(1u, 0u)] uint rn,
[Values(2u, 0u)] uint rm,
[ValueSource("_4H_")] [Random(RndCnt)] ulong z,
[ValueSource("_4H_")] [Random(RndCnt)] ulong a,
[ValueSource("_4H_")] [Random(RndCnt)] ulong b,
[Values(0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u)] uint index,
[Values(0b0u, 0b1u)] uint q) // <4H4S, 8H4S>
{
uint h = (index >> 2) & 1;
uint l = (index >> 1) & 1;
uint m = index & 1;
opcodes |= ((rm & 15) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0);
opcodes |= (l << 21) | (m << 20) | (h << 11);
opcodes |= ((q & 1) << 30);
Vector128<float> v0 = MakeVectorE0E1(z, z);
Vector128<float> v1 = MakeVectorE0E1(q == 0u ? a : 0ul, q == 1u ? a : 0ul);
Vector128<float> v2 = MakeVectorE0E1(b, b * h);
SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2);
CompareAgainstUnicorn();
}
[Test, Pairwise]
public void SU_Mlal_Mlsl_Mull_Ve_2S2D_4S2D([ValueSource("_SU_Mlal_Mlsl_Mull_Ve_2S2D_4S2D_")] uint opcodes,
[Values(0u)] uint rd,
[Values(1u, 0u)] uint rn,
[Values(2u, 0u)] uint rm,
[ValueSource("_2S_")] [Random(RndCnt)] ulong z,
[ValueSource("_2S_")] [Random(RndCnt)] ulong a,
[ValueSource("_2S_")] [Random(RndCnt)] ulong b,
[Values(0u, 1u, 2u, 3u)] uint index,
[Values(0b0u, 0b1u)] uint q) // <2S2D, 4S2D>
{
uint h = (index >> 1) & 1;
uint l = index & 1;
opcodes |= ((rm & 15) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0);
opcodes |= (l << 21) | (h << 11);
opcodes |= ((q & 1) << 30);
Vector128<float> v0 = MakeVectorE0E1(z, z);
Vector128<float> v1 = MakeVectorE0E1(q == 0u ? a : 0ul, q == 1u ? a : 0ul);
Vector128<float> v2 = MakeVectorE0E1(b, b * h);
SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2);
CompareAgainstUnicorn();
}
#endif #endif
} }
} }