From 8f7fcede7fa98c605925dc7b9316940960543bf1 Mon Sep 17 00:00:00 2001 From: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com> Date: Tue, 29 Jan 2019 14:54:39 +0100 Subject: [PATCH] Add Smlal_Ve, Smlsl_Ve, Smull_Ve, Umlal_Ve, Umlsl_Ve, Umull_Ve Inst.; add Tests. Add Sse Opt. for Trn1/2_V and Uzp1/2_V Inst. Nits. (#566) * Update OpCodeTable.cs * Update InstEmitSimdArithmetic.cs * Update InstEmitSimdHelper.cs * Update CpuTestSimdRegElem.cs * Update InstEmitSimdMove.cs * Update InstEmitSimdCvt.cs * Update SoftFallback.cs * Update InstEmitSimdHelper.cs * Update SoftFloat.cs * Update CryptoHelper.cs * Update InstEmitSimdArithmetic.cs * Update InstEmitSimdCmp.cs * Address PR feedback. * Address PR feedback. --- ChocolArm64/Instructions/CryptoHelper.cs | 77 +++--- .../Instructions/InstEmitSimdArithmetic.cs | 97 ++++--- ChocolArm64/Instructions/InstEmitSimdCmp.cs | 30 +-- ChocolArm64/Instructions/InstEmitSimdCvt.cs | 10 +- .../Instructions/InstEmitSimdHelper.cs | 66 ++++- ChocolArm64/Instructions/InstEmitSimdMove.cs | 244 +++++++++++++----- ChocolArm64/Instructions/SoftFallback.cs | 9 +- ChocolArm64/Instructions/SoftFloat.cs | 8 +- ChocolArm64/OpCodeTable.cs | 6 + Ryujinx.Tests/Cpu/CpuTestSimdRegElem.cs | 81 ++++++ 10 files changed, 453 insertions(+), 175 deletions(-) diff --git a/ChocolArm64/Instructions/CryptoHelper.cs b/ChocolArm64/Instructions/CryptoHelper.cs index b38d79a8..e9b6ed5f 100644 --- a/ChocolArm64/Instructions/CryptoHelper.cs +++ b/ChocolArm64/Instructions/CryptoHelper.cs @@ -9,7 +9,7 @@ namespace ChocolArm64.Instructions static class CryptoHelper { #region "LookUp Tables" - private static byte[] _sBox = + private static readonly byte[] _sBox = new byte[] { 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, @@ -29,7 +29,7 @@ namespace ChocolArm64.Instructions 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 }; - private static byte[] _invSBox = + private static readonly byte[] _invSBox = new byte[] { 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, @@ -49,7 +49,7 @@ namespace ChocolArm64.Instructions 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d }; - private static byte[] _gfMul02 = + private static readonly byte[] _gfMul02 = new byte[] { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e, 0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, @@ -69,7 +69,7 @@ namespace ChocolArm64.Instructions 0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5 }; - private static byte[] _gfMul03 = + private static readonly byte[] _gfMul03 = new byte[] { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11, 0x30, 0x33, 0x36, 0x35, 0x3c, 0x3f, 0x3a, 0x39, 0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21, @@ -89,7 +89,7 @@ namespace ChocolArm64.Instructions 0x0b, 0x08, 0x0d, 0x0e, 0x07, 0x04, 0x01, 0x02, 0x13, 0x10, 0x15, 0x16, 0x1f, 0x1c, 0x19, 0x1a }; - private static byte[] _gfMul09 = + private static readonly byte[] _gfMul09 = new byte[] { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77, 0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf, 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7, @@ -109,7 +109,7 @@ namespace ChocolArm64.Instructions 0x31, 0x38, 0x23, 0x2a, 0x15, 0x1c, 0x07, 0x0e, 0x79, 0x70, 0x6b, 0x62, 0x5d, 0x54, 0x4f, 0x46 }; - private static byte[] _gfMul0B = + private static readonly byte[] _gfMul0B = new byte[] { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69, 0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81, 0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9, @@ -129,7 +129,7 @@ namespace ChocolArm64.Instructions 0xca, 0xc1, 0xdc, 0xd7, 0xe6, 0xed, 0xf0, 0xfb, 0x92, 0x99, 0x84, 0x8f, 0xbe, 0xb5, 0xa8, 0xa3 }; - private static byte[] _gfMul0D = + private static readonly byte[] _gfMul0D = new byte[] { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b, 0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3, 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b, @@ -149,7 +149,7 @@ namespace ChocolArm64.Instructions 0xdc, 0xd1, 0xc6, 0xcb, 0xe8, 0xe5, 0xf2, 0xff, 0xb4, 0xb9, 0xae, 0xa3, 0x80, 0x8d, 0x9a, 0x97 }; - private static byte[] _gfMul0E = + private static readonly byte[] _gfMul0E = new byte[] { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a, 0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca, 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba, @@ -169,9 +169,15 @@ namespace ChocolArm64.Instructions 0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0xa7, 0xa9, 0xbb, 0xb5, 0x9f, 0x91, 0x83, 0x8d }; - private static byte[] _srPerm = { 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3 }; + private static readonly byte[] _srPerm = new byte[] + { + 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3 + }; - private static byte[] _isrPerm = { 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11 }; + private static readonly byte[] _isrPerm = new byte[] + { + 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11 + }; #endregion public static Vector128 AesInvMixColumns(Vector128 op) @@ -179,7 +185,7 @@ namespace ChocolArm64.Instructions byte[] inState = new byte[16]; byte[] outState = new byte[16]; - FromVectorToByteArray(inState, ref op); + FromVectorToByteArray(op, inState); for (int columns = 0; columns <= 3; columns++) { @@ -206,7 +212,7 @@ namespace ChocolArm64.Instructions byte[] inState = new byte[16]; byte[] outState = new byte[16]; - FromVectorToByteArray(inState, ref op); + FromVectorToByteArray(op, inState); for (int idx = 0; idx <= 15; idx++) { @@ -223,7 +229,7 @@ namespace ChocolArm64.Instructions byte[] inState = new byte[16]; byte[] outState = new byte[16]; - FromVectorToByteArray(inState, ref op); + FromVectorToByteArray(op, inState); for (int idx = 0; idx <= 15; idx++) { @@ -240,7 +246,7 @@ namespace ChocolArm64.Instructions byte[] inState = new byte[16]; byte[] outState = new byte[16]; - FromVectorToByteArray(inState, ref op); + FromVectorToByteArray(op, inState); for (int columns = 0; columns <= 3; columns++) { @@ -267,7 +273,7 @@ namespace ChocolArm64.Instructions byte[] inState = new byte[16]; byte[] outState = new byte[16]; - FromVectorToByteArray(inState, ref op); + FromVectorToByteArray(op, inState); for (int idx = 0; idx <= 15; idx++) { @@ -284,7 +290,7 @@ namespace ChocolArm64.Instructions byte[] inState = new byte[16]; byte[] outState = new byte[16]; - FromVectorToByteArray(inState, ref op); + FromVectorToByteArray(op, inState); for (int idx = 0; idx <= 15; idx++) { @@ -296,33 +302,30 @@ namespace ChocolArm64.Instructions return op; } - private static void FromVectorToByteArray(byte[] state, ref Vector128 op) - { - ulong uLongLow = VectorHelper.VectorExtractIntZx((op), (byte)0, 3); - ulong uLongHigh = VectorHelper.VectorExtractIntZx((op), (byte)1, 3); - - for (int idx = 0; idx <= 7; idx++) - { - state[idx + 0] = (byte)(uLongLow & 0xFFUL); - state[idx + 8] = (byte)(uLongHigh & 0xFFUL); - - uLongLow >>= 8; - uLongHigh >>= 8; - } - } - - private static void FromByteArrayToVector(byte[] state, ref Vector128 op) + private unsafe static void FromVectorToByteArray(Vector128 op, byte[] state) { if (!Sse2.IsSupported) { throw new PlatformNotSupportedException(); } - op = Sse.StaticCast(Sse2.SetVector128( - state[15], state[14], state[13], state[12], - state[11], state[10], state[9], state[8], - state[7], state[6], state[5], state[4], - state[3], state[2], state[1], state[0])); + fixed (byte* ptr = &state[0]) + { + Sse2.Store(ptr, Sse.StaticCast(op)); + } + } + + private unsafe static void FromByteArrayToVector(byte[] state, ref Vector128 op) + { + if (!Sse2.IsSupported) + { + throw new PlatformNotSupportedException(); + } + + fixed (byte* ptr = &state[0]) + { + op = Sse.StaticCast(Sse2.LoadVector128(ptr)); + } } } } diff --git a/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs b/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs index d1e71ecb..acb9f7f0 100644 --- a/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs +++ b/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs @@ -392,8 +392,7 @@ namespace ChocolArm64.Instructions public static void Fadd_S(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitScalarSseOrSse2OpF(context, nameof(Sse.AddScalar)); } @@ -408,8 +407,7 @@ namespace ChocolArm64.Instructions public static void Fadd_V(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitVectorSseOrSse2OpF(context, nameof(Sse.Add)); } @@ -470,8 +468,7 @@ namespace ChocolArm64.Instructions public static void Faddp_V(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Add)); } @@ -486,8 +483,7 @@ namespace ChocolArm64.Instructions public static void Fdiv_S(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitScalarSseOrSse2OpF(context, nameof(Sse.DivideScalar)); } @@ -502,8 +498,7 @@ namespace ChocolArm64.Instructions public static void Fdiv_V(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitVectorSseOrSse2OpF(context, nameof(Sse.Divide)); } @@ -564,8 +559,7 @@ namespace ChocolArm64.Instructions public static void Fmax_S(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitScalarSseOrSse2OpF(context, nameof(Sse.MaxScalar)); } @@ -580,8 +574,7 @@ namespace ChocolArm64.Instructions public static void Fmax_V(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitVectorSseOrSse2OpF(context, nameof(Sse.Max)); } @@ -612,8 +605,7 @@ namespace ChocolArm64.Instructions public static void Fmaxp_V(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Max)); } @@ -628,8 +620,7 @@ namespace ChocolArm64.Instructions public static void Fmin_S(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitScalarSseOrSse2OpF(context, nameof(Sse.MinScalar)); } @@ -644,8 +635,7 @@ namespace ChocolArm64.Instructions public static void Fmin_V(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitVectorSseOrSse2OpF(context, nameof(Sse.Min)); } @@ -676,8 +666,7 @@ namespace ChocolArm64.Instructions public static void Fminp_V(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Min)); } @@ -984,8 +973,7 @@ namespace ChocolArm64.Instructions public static void Fmul_S(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitScalarSseOrSse2OpF(context, nameof(Sse.MultiplyScalar)); } @@ -1005,8 +993,7 @@ namespace ChocolArm64.Instructions public static void Fmul_V(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitVectorSseOrSse2OpF(context, nameof(Sse.Multiply)); } @@ -1753,8 +1740,7 @@ namespace ChocolArm64.Instructions public static void Fsqrt_S(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitScalarSseOrSse2OpF(context, nameof(Sse.SqrtScalar)); } @@ -1769,8 +1755,7 @@ namespace ChocolArm64.Instructions public static void Fsqrt_V(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitVectorSseOrSse2OpF(context, nameof(Sse.Sqrt)); } @@ -1785,8 +1770,7 @@ namespace ChocolArm64.Instructions public static void Fsub_S(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitScalarSseOrSse2OpF(context, nameof(Sse.SubtractScalar)); } @@ -1801,8 +1785,7 @@ namespace ChocolArm64.Instructions public static void Fsub_V(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitVectorSseOrSse2OpF(context, nameof(Sse.Subtract)); } @@ -2268,6 +2251,15 @@ namespace ChocolArm64.Instructions } } + public static void Smlal_Ve(ILEmitterCtx context) + { + EmitVectorWidenTernaryOpByElemSx(context, () => + { + context.Emit(OpCodes.Mul); + context.Emit(OpCodes.Add); + }); + } + public static void Smlsl_V(ILEmitterCtx context) { OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; @@ -2319,11 +2311,25 @@ namespace ChocolArm64.Instructions } } + public static void Smlsl_Ve(ILEmitterCtx context) + { + EmitVectorWidenTernaryOpByElemSx(context, () => + { + context.Emit(OpCodes.Mul); + context.Emit(OpCodes.Sub); + }); + } + public static void Smull_V(ILEmitterCtx context) { EmitVectorWidenRnRmBinaryOpSx(context, () => context.Emit(OpCodes.Mul)); } + public static void Smull_Ve(ILEmitterCtx context) + { + EmitVectorWidenBinaryOpByElemSx(context, () => context.Emit(OpCodes.Mul)); + } + public static void Sqabs_S(ILEmitterCtx context) { EmitScalarSaturatingUnaryOpSx(context, () => EmitAbs(context)); @@ -2929,6 +2935,15 @@ namespace ChocolArm64.Instructions } } + public static void Umlal_Ve(ILEmitterCtx context) + { + EmitVectorWidenTernaryOpByElemZx(context, () => + { + context.Emit(OpCodes.Mul); + context.Emit(OpCodes.Add); + }); + } + public static void Umlsl_V(ILEmitterCtx context) { OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; @@ -2980,11 +2995,25 @@ namespace ChocolArm64.Instructions } } + public static void Umlsl_Ve(ILEmitterCtx context) + { + EmitVectorWidenTernaryOpByElemZx(context, () => + { + context.Emit(OpCodes.Mul); + context.Emit(OpCodes.Sub); + }); + } + public static void Umull_V(ILEmitterCtx context) { EmitVectorWidenRnRmBinaryOpZx(context, () => context.Emit(OpCodes.Mul)); } + public static void Umull_Ve(ILEmitterCtx context) + { + EmitVectorWidenBinaryOpByElemZx(context, () => context.Emit(OpCodes.Mul)); + } + public static void Uqadd_S(ILEmitterCtx context) { EmitScalarSaturatingBinaryOpZx(context, SaturatingFlags.Add); diff --git a/ChocolArm64/Instructions/InstEmitSimdCmp.cs b/ChocolArm64/Instructions/InstEmitSimdCmp.cs index e1184375..fdf3951e 100644 --- a/ChocolArm64/Instructions/InstEmitSimdCmp.cs +++ b/ChocolArm64/Instructions/InstEmitSimdCmp.cs @@ -173,8 +173,7 @@ namespace ChocolArm64.Instructions public static void Fcmeq_S(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareEqualScalar), scalar: true); } @@ -186,8 +185,7 @@ namespace ChocolArm64.Instructions public static void Fcmeq_V(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareEqual), scalar: false); } @@ -199,8 +197,7 @@ namespace ChocolArm64.Instructions public static void Fcmge_S(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqualScalar), scalar: true); } @@ -212,8 +209,7 @@ namespace ChocolArm64.Instructions public static void Fcmge_V(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqual), scalar: false); } @@ -225,8 +221,7 @@ namespace ChocolArm64.Instructions public static void Fcmgt_S(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanScalar), scalar: true); } @@ -238,8 +233,7 @@ namespace ChocolArm64.Instructions public static void Fcmgt_V(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThan), scalar: false); } @@ -251,8 +245,7 @@ namespace ChocolArm64.Instructions public static void Fcmle_S(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqualScalar), scalar: true, isLeOrLt: true); } @@ -264,8 +257,7 @@ namespace ChocolArm64.Instructions public static void Fcmle_V(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqual), scalar: false, isLeOrLt: true); } @@ -277,8 +269,7 @@ namespace ChocolArm64.Instructions public static void Fcmlt_S(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanScalar), scalar: true, isLeOrLt: true); } @@ -290,8 +281,7 @@ namespace ChocolArm64.Instructions public static void Fcmlt_V(ILEmitterCtx context) { - if (Optimizations.FastFP && Optimizations.UseSse - && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse2) { EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThan), scalar: false, isLeOrLt: true); } diff --git a/ChocolArm64/Instructions/InstEmitSimdCvt.cs b/ChocolArm64/Instructions/InstEmitSimdCvt.cs index fd6146b3..2eac3194 100644 --- a/ChocolArm64/Instructions/InstEmitSimdCvt.cs +++ b/ChocolArm64/Instructions/InstEmitSimdCvt.cs @@ -78,7 +78,6 @@ namespace ChocolArm64.Instructions if (Optimizations.UseSse2 && sizeF == 1) { - Type[] typesMov = new Type[] { typeof(Vector128), typeof(Vector128) }; Type[] typesCvt = new Type[] { typeof(Vector128) }; string nameMov = op.RegisterSize == RegisterSize.Simd128 @@ -88,7 +87,7 @@ namespace ChocolArm64.Instructions context.EmitLdvec(op.Rn); context.Emit(OpCodes.Dup); - context.EmitCall(typeof(Sse).GetMethod(nameMov, typesMov)); + context.EmitCall(typeof(Sse).GetMethod(nameMov)); context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Double), typesCvt)); @@ -144,7 +143,6 @@ namespace ChocolArm64.Instructions if (Optimizations.UseSse2 && sizeF == 1) { - Type[] typesMov = new Type[] { typeof(Vector128), typeof(Vector128) }; Type[] typesCvt = new Type[] { typeof(Vector128) }; string nameMov = op.RegisterSize == RegisterSize.Simd128 @@ -154,15 +152,15 @@ namespace ChocolArm64.Instructions context.EmitLdvec(op.Rd); VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero)); - context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), typesMov)); + context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh))); EmitLdvecWithCastToDouble(context, op.Rn); context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Single), typesCvt)); context.Emit(OpCodes.Dup); - context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), typesMov)); + context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh))); - context.EmitCall(typeof(Sse).GetMethod(nameMov, typesMov)); + context.EmitCall(typeof(Sse).GetMethod(nameMov)); context.EmitStvec(op.Rd); } diff --git a/ChocolArm64/Instructions/InstEmitSimdHelper.cs b/ChocolArm64/Instructions/InstEmitSimdHelper.cs index cea481a6..5a44e1a1 100644 --- a/ChocolArm64/Instructions/InstEmitSimdHelper.cs +++ b/ChocolArm64/Instructions/InstEmitSimdHelper.cs @@ -642,21 +642,21 @@ namespace ChocolArm64.Instructions { OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp; - EmitVectorOpByElem(context, emit, op.Index, false, true); + EmitVectorOpByElem(context, emit, op.Index, ternary: false, signed: true); } public static void EmitVectorBinaryOpByElemZx(ILEmitterCtx context, Action emit) { OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp; - EmitVectorOpByElem(context, emit, op.Index, false, false); + EmitVectorOpByElem(context, emit, op.Index, ternary: false, signed: false); } public static void EmitVectorTernaryOpByElemZx(ILEmitterCtx context, Action emit) { OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp; - EmitVectorOpByElem(context, emit, op.Index, true, false); + EmitVectorOpByElem(context, emit, op.Index, ternary: true, signed: false); } public static void EmitVectorOpByElem(ILEmitterCtx context, Action emit, int elem, bool ternary, bool signed) @@ -809,6 +809,64 @@ namespace ChocolArm64.Instructions context.EmitStvec(op.Rd); } + public static void EmitVectorWidenBinaryOpByElemSx(ILEmitterCtx context, Action emit) + { + OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp; + + EmitVectorWidenOpByElem(context, emit, op.Index, ternary: false, signed: true); + } + + public static void EmitVectorWidenBinaryOpByElemZx(ILEmitterCtx context, Action emit) + { + OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp; + + EmitVectorWidenOpByElem(context, emit, op.Index, ternary: false, signed: false); + } + + public static void EmitVectorWidenTernaryOpByElemSx(ILEmitterCtx context, Action emit) + { + OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp; + + EmitVectorWidenOpByElem(context, emit, op.Index, ternary: true, signed: true); + } + + public static void EmitVectorWidenTernaryOpByElemZx(ILEmitterCtx context, Action emit) + { + OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp; + + EmitVectorWidenOpByElem(context, emit, op.Index, ternary: true, signed: false); + } + + public static void EmitVectorWidenOpByElem(ILEmitterCtx context, Action emit, int elem, bool ternary, bool signed) + { + OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; + + int elems = 8 >> op.Size; + + int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0; + + EmitVectorExtract(context, op.Rm, elem, op.Size, signed); + context.EmitSttmp(); + + for (int index = 0; index < elems; index++) + { + if (ternary) + { + EmitVectorExtract(context, op.Rd, index, op.Size + 1, signed); + } + + EmitVectorExtract(context, op.Rn, part + index, op.Size, signed); + context.EmitLdtmp(); + + emit(); + + EmitVectorInsertTmp(context, index, op.Size + 1); + } + + context.EmitLdvectmp(); + context.EmitStvec(op.Rd); + } + public static void EmitVectorPairwiseOpSx(ILEmitterCtx context, Action emit) { EmitVectorPairwiseOp(context, emit, true); @@ -1416,7 +1474,7 @@ namespace ChocolArm64.Instructions if (Optimizations.UseSse) { //TODO: Use Sse2.MoveScalar once it is fixed, - //as of the time of writing it just crashes the JIT (SDK 2.1.500). + //as of the time of writing it just crashes the JIT (SDK 2.1.503). /*Type[] typesMov = new Type[] { typeof(Vector128) }; diff --git a/ChocolArm64/Instructions/InstEmitSimdMove.cs b/ChocolArm64/Instructions/InstEmitSimdMove.cs index d40ccff9..2844dfdf 100644 --- a/ChocolArm64/Instructions/InstEmitSimdMove.cs +++ b/ChocolArm64/Instructions/InstEmitSimdMove.cs @@ -12,6 +12,34 @@ namespace ChocolArm64.Instructions { static partial class InstEmit { +#region "Masks" + private static readonly long[] _masksE0_TrnUzpXtn = new long[] + { + 14L << 56 | 12L << 48 | 10L << 40 | 08L << 32 | 06L << 24 | 04L << 16 | 02L << 8 | 00L << 0, + 13L << 56 | 12L << 48 | 09L << 40 | 08L << 32 | 05L << 24 | 04L << 16 | 01L << 8 | 00L << 0, + 11L << 56 | 10L << 48 | 09L << 40 | 08L << 32 | 03L << 24 | 02L << 16 | 01L << 8 | 00L << 0 + }; + + private static readonly long[] _masksE1_TrnUzp = new long[] + { + 15L << 56 | 13L << 48 | 11L << 40 | 09L << 32 | 07L << 24 | 05L << 16 | 03L << 8 | 01L << 0, + 15L << 56 | 14L << 48 | 11L << 40 | 10L << 32 | 07L << 24 | 06L << 16 | 03L << 8 | 02L << 0, + 15L << 56 | 14L << 48 | 13L << 40 | 12L << 32 | 07L << 24 | 06L << 16 | 05L << 8 | 04L << 0 + }; + + private static readonly long[] _masksE0_Uzp = new long[] + { + 13L << 56 | 09L << 48 | 05L << 40 | 01L << 32 | 12L << 24 | 08L << 16 | 04L << 8 | 00L << 0, + 11L << 56 | 10L << 48 | 03L << 40 | 02L << 32 | 09L << 24 | 08L << 16 | 01L << 8 | 00L << 0 + }; + + private static readonly long[] _masksE1_Uzp = new long[] + { + 15L << 56 | 11L << 48 | 07L << 40 | 03L << 32 | 14L << 24 | 10L << 16 | 06L << 8 | 02L << 0, + 15L << 56 | 14L << 48 | 07L << 40 | 06L << 32 | 13L << 24 | 12L << 16 | 05L << 8 | 04L << 0 + }; +#endregion + public static void Dup_Gp(ILEmitterCtx context) { OpCodeSimdIns64 op = (OpCodeSimdIns64)context.CurrOp; @@ -379,15 +407,6 @@ namespace ChocolArm64.Instructions if (Optimizations.UseSsse3) { - long[] masks = new long[] - { - 14L << 56 | 12L << 48 | 10L << 40 | 08L << 32 | 06L << 24 | 04L << 16 | 02L << 8 | 00L << 0, - 13L << 56 | 12L << 48 | 09L << 40 | 08L << 32 | 05L << 24 | 04L << 16 | 01L << 8 | 00L << 0, - 11L << 56 | 10L << 48 | 09L << 40 | 08L << 32 | 03L << 24 | 02L << 16 | 01L << 8 | 00L << 0 - }; - - Type[] typesMov = new Type[] { typeof(Vector128), typeof(Vector128) }; - Type[] typesSfl = new Type[] { typeof(Vector128), typeof(Vector128) }; Type[] typesSve = new Type[] { typeof(long), typeof(long) }; string nameMov = op.RegisterSize == RegisterSize.Simd128 @@ -397,18 +416,18 @@ namespace ChocolArm64.Instructions context.EmitLdvec(op.Rd); VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero)); - context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), typesMov)); + context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh))); - EmitLdvecWithSignedCast(context, op.Rn, 0); + EmitLdvecWithSignedCast(context, op.Rn, 0); // value - context.EmitLdc_I8(masks[op.Size]); - context.Emit(OpCodes.Dup); + context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // mask + context.Emit(OpCodes.Dup); // mask context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve)); - context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), typesSfl)); + context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0))); - context.EmitCall(typeof(Sse).GetMethod(nameMov, typesMov)); + context.EmitCall(typeof(Sse).GetMethod(nameMov)); context.EmitStvec(op.Rd); } @@ -465,22 +484,61 @@ namespace ChocolArm64.Instructions { OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; - int words = op.GetBitsCount() >> 4; - int pairs = words >> op.Size; - - for (int index = 0; index < pairs; index++) + if (Optimizations.UseSsse3) { - int idx = index << 1; + Type[] typesSve = new Type[] { typeof(long), typeof(long) }; - EmitVectorExtractZx(context, op.Rn, idx + part, op.Size); - EmitVectorExtractZx(context, op.Rm, idx + part, op.Size); + string nameUpk = part == 0 + ? nameof(Sse2.UnpackLow) + : nameof(Sse2.UnpackHigh); - EmitVectorInsertTmp(context, idx + 1, op.Size); - EmitVectorInsertTmp(context, idx, op.Size); + EmitLdvecWithSignedCast(context, op.Rn, op.Size); // value + + if (op.Size < 3) + { + context.EmitLdc_I8(_masksE1_TrnUzp [op.Size]); // maskE1 + context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // maskE0 + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve)); + + context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0))); + } + + EmitLdvecWithSignedCast(context, op.Rm, op.Size); // value + + if (op.Size < 3) + { + context.EmitLdc_I8(_masksE1_TrnUzp [op.Size]); // maskE1 + context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // maskE0 + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve)); + + context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0))); + } + + context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(op.Size))); + + EmitStvecWithSignedCast(context, op.Rd, op.Size); } + else + { + int words = op.GetBitsCount() >> 4; + int pairs = words >> op.Size; - context.EmitLdvectmp(); - context.EmitStvec(op.Rd); + for (int index = 0; index < pairs; index++) + { + int idx = index << 1; + + EmitVectorExtractZx(context, op.Rn, idx + part, op.Size); + EmitVectorExtractZx(context, op.Rm, idx + part, op.Size); + + EmitVectorInsertTmp(context, idx + 1, op.Size); + EmitVectorInsertTmp(context, idx, op.Size); + } + + context.EmitLdvectmp(); + context.EmitStvec(op.Rd); + } if (op.RegisterSize == RegisterSize.Simd64) { @@ -492,26 +550,91 @@ namespace ChocolArm64.Instructions { OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; - int words = op.GetBitsCount() >> 4; - int pairs = words >> op.Size; - - for (int index = 0; index < pairs; index++) + if (Optimizations.UseSsse3) { - int idx = index << 1; + Type[] typesSve = new Type[] { typeof(long), typeof(long) }; - EmitVectorExtractZx(context, op.Rn, idx + part, op.Size); - EmitVectorExtractZx(context, op.Rm, idx + part, op.Size); + string nameUpk = part == 0 + ? nameof(Sse2.UnpackLow) + : nameof(Sse2.UnpackHigh); - EmitVectorInsertTmp(context, pairs + index, op.Size); - EmitVectorInsertTmp(context, index, op.Size); + if (op.RegisterSize == RegisterSize.Simd128) + { + EmitLdvecWithSignedCast(context, op.Rn, op.Size); // value + + if (op.Size < 3) + { + context.EmitLdc_I8(_masksE1_TrnUzp [op.Size]); // maskE1 + context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // maskE0 + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve)); + + context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0))); + } + + EmitLdvecWithSignedCast(context, op.Rm, op.Size); // value + + if (op.Size < 3) + { + context.EmitLdc_I8(_masksE1_TrnUzp [op.Size]); // maskE1 + context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // maskE0 + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve)); + + context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0))); + } + + context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(3))); + + EmitStvecWithSignedCast(context, op.Rd, op.Size); + } + else + { + EmitLdvecWithSignedCast(context, op.Rn, op.Size); + EmitLdvecWithSignedCast(context, op.Rm, op.Size); + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.UnpackLow), GetTypesSflUpk(op.Size))); // value + + if (op.Size < 2) + { + context.EmitLdc_I8(_masksE1_Uzp[op.Size]); // maskE1 + context.EmitLdc_I8(_masksE0_Uzp[op.Size]); // maskE0 + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve)); + + context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0))); + } + + VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInt64Zero)); + + context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(3))); + + EmitStvecWithSignedCast(context, op.Rd, op.Size); + } } - - context.EmitLdvectmp(); - context.EmitStvec(op.Rd); - - if (op.RegisterSize == RegisterSize.Simd64) + else { - EmitVectorZeroUpper(context, op.Rd); + int words = op.GetBitsCount() >> 4; + int pairs = words >> op.Size; + + for (int index = 0; index < pairs; index++) + { + int idx = index << 1; + + EmitVectorExtractZx(context, op.Rn, idx + part, op.Size); + EmitVectorExtractZx(context, op.Rm, idx + part, op.Size); + + EmitVectorInsertTmp(context, pairs + index, op.Size); + EmitVectorInsertTmp(context, index, op.Size); + } + + context.EmitLdvectmp(); + context.EmitStvec(op.Rd); + + if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } } } @@ -521,36 +644,26 @@ namespace ChocolArm64.Instructions if (Optimizations.UseSse2) { - EmitLdvecWithUnsignedCast(context, op.Rn, op.Size); - EmitLdvecWithUnsignedCast(context, op.Rm, op.Size); - - Type[] types = new Type[] - { - VectorUIntTypesPerSizeLog2[op.Size], - VectorUIntTypesPerSizeLog2[op.Size] - }; - - string name = part == 0 || (part != 0 && op.RegisterSize == RegisterSize.Simd64) + string nameUpk = part == 0 ? nameof(Sse2.UnpackLow) : nameof(Sse2.UnpackHigh); - context.EmitCall(typeof(Sse2).GetMethod(name, types)); + EmitLdvecWithSignedCast(context, op.Rn, op.Size); + EmitLdvecWithSignedCast(context, op.Rm, op.Size); - if (op.RegisterSize == RegisterSize.Simd64 && part != 0) + if (op.RegisterSize == RegisterSize.Simd128) { - context.EmitLdc_I4(8); + context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(op.Size))); + } + else + { + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.UnpackLow), GetTypesSflUpk(op.Size))); + VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInt64Zero)); - Type[] shTypes = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) }; - - context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), shTypes)); + context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(3))); } - EmitStvecWithUnsignedCast(context, op.Rd, op.Size); - - if (op.RegisterSize == RegisterSize.Simd64 && part == 0) - { - EmitVectorZeroUpper(context, op.Rd); - } + EmitStvecWithSignedCast(context, op.Rd, op.Size); } else { @@ -579,5 +692,10 @@ namespace ChocolArm64.Instructions } } } + + private static Type[] GetTypesSflUpk(int size) + { + return new Type[] { VectorIntTypesPerSizeLog2[size], VectorIntTypesPerSizeLog2[size] }; + } } } diff --git a/ChocolArm64/Instructions/SoftFallback.cs b/ChocolArm64/Instructions/SoftFallback.cs index def95343..16638894 100644 --- a/ChocolArm64/Instructions/SoftFallback.cs +++ b/ChocolArm64/Instructions/SoftFallback.cs @@ -664,7 +664,7 @@ namespace ChocolArm64.Instructions for (int bit = highBit; bit >= 0; bit--) { - if (((value >> bit) & 0b1) != 0) + if (((int)(value >> bit) & 0b1) != 0) { return (ulong)(highBit - bit); } @@ -688,7 +688,7 @@ namespace ChocolArm64.Instructions do { nibbleIdx -= 4; - preCount = ClzNibbleTbl[(value >> nibbleIdx) & 0b1111]; + preCount = ClzNibbleTbl[(int)(value >> nibbleIdx) & 0b1111]; count += preCount; } while (preCount == 4); @@ -698,11 +698,6 @@ namespace ChocolArm64.Instructions public static ulong CountSetBits8(ulong value) // "size" is 8 (SIMD&FP Inst.). { - if (value == 0xfful) - { - return 8ul; - } - value = ((value >> 1) & 0x55ul) + (value & 0x55ul); value = ((value >> 2) & 0x33ul) + (value & 0x33ul); diff --git a/ChocolArm64/Instructions/SoftFloat.cs b/ChocolArm64/Instructions/SoftFloat.cs index 39d279de..3521ad15 100644 --- a/ChocolArm64/Instructions/SoftFloat.cs +++ b/ChocolArm64/Instructions/SoftFloat.cs @@ -1545,9 +1545,9 @@ namespace ChocolArm64.Instructions return -value; } - private static float ZerosOrOnes(bool zeros) + private static float ZerosOrOnes(bool ones) { - return BitConverter.Int32BitsToSingle(!zeros ? 0 : -1); + return BitConverter.Int32BitsToSingle(ones ? -1 : 0); } private static float FPUnpack( @@ -2629,9 +2629,9 @@ namespace ChocolArm64.Instructions return -value; } - private static double ZerosOrOnes(bool zeros) + private static double ZerosOrOnes(bool ones) { - return BitConverter.Int64BitsToDouble(!zeros ? 0L : -1L); + return BitConverter.Int64BitsToDouble(ones ? -1L : 0L); } private static double FPUnpack( diff --git a/ChocolArm64/OpCodeTable.cs b/ChocolArm64/OpCodeTable.cs index 34b420fb..db7a4ca9 100644 --- a/ChocolArm64/OpCodeTable.cs +++ b/ChocolArm64/OpCodeTable.cs @@ -445,9 +445,12 @@ namespace ChocolArm64 SetA64("0x001110<<1xxxxx011011xxxxxxxxxx", InstEmit.Smin_V, typeof(OpCodeSimdReg64)); SetA64("0x001110<<1xxxxx101011xxxxxxxxxx", InstEmit.Sminp_V, typeof(OpCodeSimdReg64)); SetA64("0x001110<<1xxxxx100000xxxxxxxxxx", InstEmit.Smlal_V, typeof(OpCodeSimdReg64)); + SetA64("0x001111xxxxxxxx0010x0xxxxxxxxxx", InstEmit.Smlal_Ve, typeof(OpCodeSimdRegElem64)); SetA64("0x001110<<1xxxxx101000xxxxxxxxxx", InstEmit.Smlsl_V, typeof(OpCodeSimdReg64)); + SetA64("0x001111xxxxxxxx0110x0xxxxxxxxxx", InstEmit.Smlsl_Ve, typeof(OpCodeSimdRegElem64)); SetA64("0x001110000xxxxx001011xxxxxxxxxx", InstEmit.Smov_S, typeof(OpCodeSimdIns64)); SetA64("0x001110<<1xxxxx110000xxxxxxxxxx", InstEmit.Smull_V, typeof(OpCodeSimdReg64)); + SetA64("0x001111xxxxxxxx1010x0xxxxxxxxxx", InstEmit.Smull_Ve, typeof(OpCodeSimdRegElem64)); SetA64("01011110xx100000011110xxxxxxxxxx", InstEmit.Sqabs_S, typeof(OpCodeSimd64)); SetA64("0>001110<<100000011110xxxxxxxxxx", InstEmit.Sqabs_V, typeof(OpCodeSimd64)); SetA64("01011110xx1xxxxx000011xxxxxxxxxx", InstEmit.Sqadd_S, typeof(OpCodeSimdReg64)); @@ -534,9 +537,12 @@ namespace ChocolArm64 SetA64("0x101110<<1xxxxx011011xxxxxxxxxx", InstEmit.Umin_V, typeof(OpCodeSimdReg64)); SetA64("0x101110<<1xxxxx101011xxxxxxxxxx", InstEmit.Uminp_V, typeof(OpCodeSimdReg64)); SetA64("0x101110<<1xxxxx100000xxxxxxxxxx", InstEmit.Umlal_V, typeof(OpCodeSimdReg64)); + SetA64("0x101111xxxxxxxx0010x0xxxxxxxxxx", InstEmit.Umlal_Ve, typeof(OpCodeSimdRegElem64)); SetA64("0x101110<<1xxxxx101000xxxxxxxxxx", InstEmit.Umlsl_V, typeof(OpCodeSimdReg64)); + SetA64("0x101111xxxxxxxx0110x0xxxxxxxxxx", InstEmit.Umlsl_Ve, typeof(OpCodeSimdRegElem64)); SetA64("0x001110000xxxxx001111xxxxxxxxxx", InstEmit.Umov_S, typeof(OpCodeSimdIns64)); SetA64("0x101110<<1xxxxx110000xxxxxxxxxx", InstEmit.Umull_V, typeof(OpCodeSimdReg64)); + SetA64("0x101111xxxxxxxx1010x0xxxxxxxxxx", InstEmit.Umull_Ve, typeof(OpCodeSimdRegElem64)); SetA64("01111110xx1xxxxx000011xxxxxxxxxx", InstEmit.Uqadd_S, typeof(OpCodeSimdReg64)); SetA64("0>101110<<1xxxxx000011xxxxxxxxxx", InstEmit.Uqadd_V, typeof(OpCodeSimdReg64)); SetA64("0>101110<<1xxxxx010111xxxxxxxxxx", InstEmit.Uqrshl_V, typeof(OpCodeSimdReg64)); diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdRegElem.cs b/Ryujinx.Tests/Cpu/CpuTestSimdRegElem.cs index d97bd7b0..7fc593a8 100644 --- a/Ryujinx.Tests/Cpu/CpuTestSimdRegElem.cs +++ b/Ryujinx.Tests/Cpu/CpuTestSimdRegElem.cs @@ -45,6 +45,32 @@ namespace Ryujinx.Tests.Cpu 0x0F808000u // MUL V0.2S, V0.2S, V0.S[0] }; } + + private static uint[] _SU_Mlal_Mlsl_Mull_Ve_4H4S_8H4S_() + { + return new uint[] + { + 0x0F402000u, // SMLAL V0.4S, V0.4H, V0.H[0] + 0x0F406000u, // SMLSL V0.4S, V0.4H, V0.H[0] + 0x0F40A000u, // SMULL V0.4S, V0.4H, V0.H[0] + 0x2F402000u, // UMLAL V0.4S, V0.4H, V0.H[0] + 0x2F406000u, // UMLSL V0.4S, V0.4H, V0.H[0] + 0x2F40A000u // UMULL V0.4S, V0.4H, V0.H[0] + }; + } + + private static uint[] _SU_Mlal_Mlsl_Mull_Ve_2S2D_4S2D_() + { + return new uint[] + { + 0x0F802000u, // SMLAL V0.2D, V0.2S, V0.S[0] + 0x0F806000u, // SMLSL V0.2D, V0.2S, V0.S[0] + 0x0F80A000u, // SMULL V0.2D, V0.2S, V0.S[0] + 0x2F802000u, // UMLAL V0.2D, V0.2S, V0.S[0] + 0x2F806000u, // UMLSL V0.2D, V0.2S, V0.S[0] + 0x2F80A000u // UMULL V0.2D, V0.2S, V0.S[0] + }; + } #endregion private const int RndCnt = 2; @@ -103,6 +129,61 @@ namespace Ryujinx.Tests.Cpu CompareAgainstUnicorn(); } + + [Test, Pairwise] + public void SU_Mlal_Mlsl_Mull_Ve_4H4S_8H4S([ValueSource("_SU_Mlal_Mlsl_Mull_Ve_4H4S_8H4S_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [Values(2u, 0u)] uint rm, + [ValueSource("_4H_")] [Random(RndCnt)] ulong z, + [ValueSource("_4H_")] [Random(RndCnt)] ulong a, + [ValueSource("_4H_")] [Random(RndCnt)] ulong b, + [Values(0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u)] uint index, + [Values(0b0u, 0b1u)] uint q) // <4H4S, 8H4S> + { + uint h = (index >> 2) & 1; + uint l = (index >> 1) & 1; + uint m = index & 1; + + opcodes |= ((rm & 15) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); + opcodes |= (l << 21) | (m << 20) | (h << 11); + opcodes |= ((q & 1) << 30); + + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0E1(q == 0u ? a : 0ul, q == 1u ? a : 0ul); + Vector128 v2 = MakeVectorE0E1(b, b * h); + + SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2); + + CompareAgainstUnicorn(); + } + + [Test, Pairwise] + public void SU_Mlal_Mlsl_Mull_Ve_2S2D_4S2D([ValueSource("_SU_Mlal_Mlsl_Mull_Ve_2S2D_4S2D_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [Values(2u, 0u)] uint rm, + [ValueSource("_2S_")] [Random(RndCnt)] ulong z, + [ValueSource("_2S_")] [Random(RndCnt)] ulong a, + [ValueSource("_2S_")] [Random(RndCnt)] ulong b, + [Values(0u, 1u, 2u, 3u)] uint index, + [Values(0b0u, 0b1u)] uint q) // <2S2D, 4S2D> + { + uint h = (index >> 1) & 1; + uint l = index & 1; + + opcodes |= ((rm & 15) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); + opcodes |= (l << 21) | (h << 11); + opcodes |= ((q & 1) << 30); + + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0E1(q == 0u ? a : 0ul, q == 1u ? a : 0ul); + Vector128 v2 = MakeVectorE0E1(b, b * h); + + SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2); + + CompareAgainstUnicorn(); + } #endif } }