diff --git a/ChocolArm64/Decoders/OpCodeSimdFcond64.cs b/ChocolArm64/Decoders/OpCodeSimdFcond64.cs index b0f1c0eb..f805b3c1 100644 --- a/ChocolArm64/Decoders/OpCodeSimdFcond64.cs +++ b/ChocolArm64/Decoders/OpCodeSimdFcond64.cs @@ -10,8 +10,8 @@ namespace ChocolArm64.Decoders public OpCodeSimdFcond64(Inst inst, long position, int opCode) : base(inst, position, opCode) { - Nzcv = (opCode >> 0) & 0xf; + Nzcv = (opCode >> 0) & 0xf; Cond = (Cond)((opCode >> 12) & 0xf); } } -} \ No newline at end of file +} diff --git a/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs b/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs index c05e9f94..df84596b 100644 --- a/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs +++ b/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs @@ -1638,7 +1638,34 @@ namespace ChocolArm64.Instructions public static void Neg_V(ILEmitterCtx context) { - EmitVectorUnaryOpSx(context, () => context.Emit(OpCodes.Neg)); + if (Optimizations.UseSse2) + { + OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; + + Type[] typesSub = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] }; + + string[] namesSzv = new string[] { nameof(VectorHelper.VectorSByteZero), + nameof(VectorHelper.VectorInt16Zero), + nameof(VectorHelper.VectorInt32Zero), + nameof(VectorHelper.VectorInt64Zero) }; + + VectorHelper.EmitCall(context, namesSzv[op.Size]); + + EmitLdvecWithSignedCast(context, op.Rn, op.Size); + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesSub)); + + EmitStvecWithSignedCast(context, op.Rd, op.Size); + + if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } + } + else + { + EmitVectorUnaryOpSx(context, () => context.Emit(OpCodes.Neg)); + } } public static void Raddhn_V(ILEmitterCtx context) diff --git a/ChocolArm64/Instructions/InstEmitSimdLogical.cs b/ChocolArm64/Instructions/InstEmitSimdLogical.cs index f51568eb..3473fc5d 100644 --- a/ChocolArm64/Instructions/InstEmitSimdLogical.cs +++ b/ChocolArm64/Instructions/InstEmitSimdLogical.cs @@ -3,6 +3,7 @@ using ChocolArm64.State; using ChocolArm64.Translation; using System; using System.Reflection.Emit; +using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; using static ChocolArm64.Instructions.InstEmitSimdHelper; @@ -29,18 +30,14 @@ namespace ChocolArm64.Instructions { OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; - EmitLdvecWithUnsignedCast(context, op.Rm, op.Size); - EmitLdvecWithUnsignedCast(context, op.Rn, op.Size); + Type[] typesAndNot = new Type[] { typeof(Vector128), typeof(Vector128) }; - Type[] types = new Type[] - { - VectorUIntTypesPerSizeLog2[op.Size], - VectorUIntTypesPerSizeLog2[op.Size] - }; + EmitLdvecWithUnsignedCast(context, op.Rm, 0); + EmitLdvecWithUnsignedCast(context, op.Rn, 0); - context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), types)); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndNot)); - EmitStvecWithUnsignedCast(context, op.Rd, op.Size); + EmitStvecWithUnsignedCast(context, op.Rd, 0); if (op.RegisterSize == RegisterSize.Simd64) { @@ -68,41 +65,34 @@ namespace ChocolArm64.Instructions public static void Bif_V(ILEmitterCtx context) { - EmitBitBif(context, true); + EmitBifBit(context, notRm: true); } public static void Bit_V(ILEmitterCtx context) { - EmitBitBif(context, false); + EmitBifBit(context, notRm: false); } - private static void EmitBitBif(ILEmitterCtx context, bool notRm) + private static void EmitBifBit(ILEmitterCtx context, bool notRm) { OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; if (Optimizations.UseSse2) { - Type[] types = new Type[] - { - VectorUIntTypesPerSizeLog2[op.Size], - VectorUIntTypesPerSizeLog2[op.Size] - }; + Type[] typesXorAndNot = new Type[] { typeof(Vector128), typeof(Vector128) }; - EmitLdvecWithUnsignedCast(context, op.Rm, op.Size); - EmitLdvecWithUnsignedCast(context, op.Rd, op.Size); - EmitLdvecWithUnsignedCast(context, op.Rn, op.Size); + string nameAndNot = notRm ? nameof(Sse2.AndNot) : nameof(Sse2.And); - context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), types)); + EmitLdvecWithUnsignedCast(context, op.Rd, 0); + EmitLdvecWithUnsignedCast(context, op.Rm, 0); + EmitLdvecWithUnsignedCast(context, op.Rn, 0); + EmitLdvecWithUnsignedCast(context, op.Rd, 0); - string name = notRm ? nameof(Sse2.AndNot) : nameof(Sse2.And); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesXorAndNot)); + context.EmitCall(typeof(Sse2).GetMethod(nameAndNot, typesXorAndNot)); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesXorAndNot)); - context.EmitCall(typeof(Sse2).GetMethod(name, types)); - - EmitLdvecWithUnsignedCast(context, op.Rd, op.Size); - - context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), types)); - - EmitStvecWithUnsignedCast(context, op.Rd, op.Size); + EmitStvecWithUnsignedCast(context, op.Rd, 0); if (op.RegisterSize == RegisterSize.Simd64) { @@ -111,17 +101,18 @@ namespace ChocolArm64.Instructions } else { - int bytes = op.GetBitsCount() >> 3; - int elems = bytes >> op.Size; + int elems = op.RegisterSize == RegisterSize.Simd128 ? 2 : 1; for (int index = 0; index < elems; index++) { - EmitVectorExtractZx(context, op.Rd, index, op.Size); - EmitVectorExtractZx(context, op.Rn, index, op.Size); + EmitVectorExtractZx(context, op.Rd, index, 3); + context.Emit(OpCodes.Dup); + + EmitVectorExtractZx(context, op.Rn, index, 3); context.Emit(OpCodes.Xor); - EmitVectorExtractZx(context, op.Rm, index, op.Size); + EmitVectorExtractZx(context, op.Rm, index, 3); if (notRm) { @@ -130,11 +121,9 @@ namespace ChocolArm64.Instructions context.Emit(OpCodes.And); - EmitVectorExtractZx(context, op.Rd, index, op.Size); - context.Emit(OpCodes.Xor); - EmitVectorInsert(context, op.Rd, index, op.Size); + EmitVectorInsert(context, op.Rd, index, 3); } if (op.RegisterSize == RegisterSize.Simd64) @@ -150,26 +139,22 @@ namespace ChocolArm64.Instructions { OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; - Type[] types = new Type[] - { - VectorUIntTypesPerSizeLog2[op.Size], - VectorUIntTypesPerSizeLog2[op.Size] - }; + Type[] typesXorAnd = new Type[] { typeof(Vector128), typeof(Vector128) }; - EmitLdvecWithUnsignedCast(context, op.Rn, op.Size); - EmitLdvecWithUnsignedCast(context, op.Rm, op.Size); + EmitLdvecWithUnsignedCast(context, op.Rm, 0); + context.Emit(OpCodes.Dup); - context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), types)); + EmitLdvecWithUnsignedCast(context, op.Rn, 0); - EmitLdvecWithUnsignedCast(context, op.Rd, op.Size); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesXorAnd)); - context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), types)); + EmitLdvecWithUnsignedCast(context, op.Rd, 0); - EmitLdvecWithUnsignedCast(context, op.Rm, op.Size); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), typesXorAnd)); - context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), types)); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesXorAnd)); - EmitStvecWithUnsignedCast(context, op.Rd, op.Size); + EmitStvecWithUnsignedCast(context, op.Rd, 0); if (op.RegisterSize == RegisterSize.Simd64) { @@ -207,16 +192,66 @@ namespace ChocolArm64.Instructions public static void Not_V(ILEmitterCtx context) { - EmitVectorUnaryOpZx(context, () => context.Emit(OpCodes.Not)); + if (Optimizations.UseSse2) + { + OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; + + Type[] typesSav = new Type[] { typeof(byte) }; + Type[] typesAndNot = new Type[] { typeof(Vector128), typeof(Vector128) }; + + EmitLdvecWithUnsignedCast(context, op.Rn, 0); + + context.EmitLdc_I4(byte.MaxValue); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav)); + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndNot)); + + EmitStvecWithUnsignedCast(context, op.Rd, 0); + + if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } + } + else + { + EmitVectorUnaryOpZx(context, () => context.Emit(OpCodes.Not)); + } } public static void Orn_V(ILEmitterCtx context) { - EmitVectorBinaryOpZx(context, () => + if (Optimizations.UseSse2) { - context.Emit(OpCodes.Not); - context.Emit(OpCodes.Or); - }); + OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; + + Type[] typesSav = new Type[] { typeof(byte) }; + Type[] typesAndNotOr = new Type[] { typeof(Vector128), typeof(Vector128) }; + + EmitLdvecWithUnsignedCast(context, op.Rn, 0); + EmitLdvecWithUnsignedCast(context, op.Rm, 0); + + context.EmitLdc_I4(byte.MaxValue); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav)); + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndNotOr)); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Or), typesAndNotOr)); + + EmitStvecWithUnsignedCast(context, op.Rd, 0); + + if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } + } + else + { + EmitVectorBinaryOpZx(context, () => + { + context.Emit(OpCodes.Not); + context.Emit(OpCodes.Or); + }); + } } public static void Orr_V(ILEmitterCtx context) @@ -263,28 +298,122 @@ namespace ChocolArm64.Instructions public static void Rev16_V(ILEmitterCtx context) { - EmitRev_V(context, containerSize: 1); + if (Optimizations.UseSsse3) + { + OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; + + Type[] typesSve = new Type[] { typeof(long), typeof(long) }; + Type[] typesSfl = new Type[] { typeof(Vector128), typeof(Vector128) }; + + EmitLdvecWithSignedCast(context, op.Rn, 0); // value + + context.EmitLdc_I8(14L << 56 | 15L << 48 | 12L << 40 | 13L << 32 | 10L << 24 | 11L << 16 | 08L << 8 | 09L << 0); // maskE1 + context.EmitLdc_I8(06L << 56 | 07L << 48 | 04L << 40 | 05L << 32 | 02L << 24 | 03L << 16 | 00L << 8 | 01L << 0); // maskE0 + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve)); + + context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), typesSfl)); + + EmitStvecWithSignedCast(context, op.Rd, 0); + + if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } + } + else + { + EmitRev_V(context, containerSize: 1); + } } public static void Rev32_V(ILEmitterCtx context) { - EmitRev_V(context, containerSize: 2); + if (Optimizations.UseSsse3) + { + OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; + + Type[] typesSve = new Type[] { typeof(long), typeof(long) }; + Type[] typesSfl = new Type[] { typeof(Vector128), typeof(Vector128) }; + + EmitLdvecWithSignedCast(context, op.Rn, op.Size); // value + + if (op.Size == 0) + { + context.EmitLdc_I8(12L << 56 | 13L << 48 | 14L << 40 | 15L << 32 | 08L << 24 | 09L << 16 | 10L << 8 | 11L << 0); // maskE1 + context.EmitLdc_I8(04L << 56 | 05L << 48 | 06L << 40 | 07L << 32 | 00L << 24 | 01L << 16 | 02L << 8 | 03L << 0); // maskE0 + } + else /* if (op.Size == 1) */ + { + context.EmitLdc_I8(13L << 56 | 12L << 48 | 15L << 40 | 14L << 32 | 09L << 24 | 08L << 16 | 11L << 8 | 10L << 0); // maskE1 + context.EmitLdc_I8(05L << 56 | 04L << 48 | 07L << 40 | 06L << 32 | 01L << 24 | 00L << 16 | 03L << 8 | 02L << 0); // maskE0 + } + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve)); + + context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), typesSfl)); + + EmitStvecWithSignedCast(context, op.Rd, op.Size); + + if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } + } + else + { + EmitRev_V(context, containerSize: 2); + } } public static void Rev64_V(ILEmitterCtx context) { - EmitRev_V(context, containerSize: 3); + if (Optimizations.UseSsse3) + { + OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; + + Type[] typesSve = new Type[] { typeof(long), typeof(long) }; + Type[] typesSfl = new Type[] { typeof(Vector128), typeof(Vector128) }; + + EmitLdvecWithSignedCast(context, op.Rn, op.Size); // value + + if (op.Size == 0) + { + context.EmitLdc_I8(08L << 56 | 09L << 48 | 10L << 40 | 11L << 32 | 12L << 24 | 13L << 16 | 14L << 8 | 15L << 0); // maskE1 + context.EmitLdc_I8(00L << 56 | 01L << 48 | 02L << 40 | 03L << 32 | 04L << 24 | 05L << 16 | 06L << 8 | 07L << 0); // maskE0 + } + else if (op.Size == 1) + { + context.EmitLdc_I8(09L << 56 | 08L << 48 | 11L << 40 | 10L << 32 | 13L << 24 | 12L << 16 | 15L << 8 | 14L << 0); // maskE1 + context.EmitLdc_I8(01L << 56 | 00L << 48 | 03L << 40 | 02L << 32 | 05L << 24 | 04L << 16 | 07L << 8 | 06L << 0); // maskE0 + } + else /* if (op.Size == 2) */ + { + context.EmitLdc_I8(11L << 56 | 10L << 48 | 09L << 40 | 08L << 32 | 15L << 24 | 14L << 16 | 13L << 8 | 12L << 0); // maskE1 + context.EmitLdc_I8(03L << 56 | 02L << 48 | 01L << 40 | 00L << 32 | 07L << 24 | 06L << 16 | 05L << 8 | 04L << 0); // maskE0 + } + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve)); + + context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), typesSfl)); + + EmitStvecWithSignedCast(context, op.Rd, op.Size); + + if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } + } + else + { + EmitRev_V(context, containerSize: 3); + } } private static void EmitRev_V(ILEmitterCtx context, int containerSize) { OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; - if (op.Size >= containerSize) - { - throw new InvalidOperationException(); - } - int bytes = op.GetBitsCount() >> 3; int elems = bytes >> op.Size; diff --git a/ChocolArm64/Instructions/InstEmitSimdShift.cs b/ChocolArm64/Instructions/InstEmitSimdShift.cs index b183e8aa..5b606167 100644 --- a/ChocolArm64/Instructions/InstEmitSimdShift.cs +++ b/ChocolArm64/Instructions/InstEmitSimdShift.cs @@ -110,6 +110,34 @@ namespace ChocolArm64.Instructions } } + public static void Sqrshl_V(ILEmitterCtx context) + { + OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; + + int bytes = op.GetBitsCount() >> 3; + int elems = bytes >> op.Size; + + for (int index = 0; index < elems; index++) + { + EmitVectorExtractSx(context, op.Rn, index, op.Size); + EmitVectorExtractSx(context, op.Rm, index, op.Size); + + context.Emit(OpCodes.Ldc_I4_1); + context.EmitLdc_I4(op.Size); + + context.EmitLdarg(TranslatedSub.StateArgIdx); + + SoftFallback.EmitCall(context, nameof(SoftFallback.SignedShlRegSatQ)); + + EmitVectorInsert(context, op.Rd, index, op.Size); + } + + if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } + } + public static void Sqrshrn_S(ILEmitterCtx context) { EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarSxSx); @@ -130,6 +158,34 @@ namespace ChocolArm64.Instructions EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorSxZx); } + public static void Sqshl_V(ILEmitterCtx context) + { + OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; + + int bytes = op.GetBitsCount() >> 3; + int elems = bytes >> op.Size; + + for (int index = 0; index < elems; index++) + { + EmitVectorExtractSx(context, op.Rn, index, op.Size); + EmitVectorExtractSx(context, op.Rm, index, op.Size); + + context.Emit(OpCodes.Ldc_I4_0); + context.EmitLdc_I4(op.Size); + + context.EmitLdarg(TranslatedSub.StateArgIdx); + + SoftFallback.EmitCall(context, nameof(SoftFallback.SignedShlRegSatQ)); + + EmitVectorInsert(context, op.Rd, index, op.Size); + } + + if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } + } + public static void Sqshrn_S(ILEmitterCtx context) { EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarSxSx); @@ -150,6 +206,32 @@ namespace ChocolArm64.Instructions EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorSxZx); } + public static void Srshl_V(ILEmitterCtx context) + { + OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; + + int bytes = op.GetBitsCount() >> 3; + int elems = bytes >> op.Size; + + for (int index = 0; index < elems; index++) + { + EmitVectorExtractSx(context, op.Rn, index, op.Size); + EmitVectorExtractSx(context, op.Rm, index, op.Size); + + context.Emit(OpCodes.Ldc_I4_1); + context.EmitLdc_I4(op.Size); + + SoftFallback.EmitCall(context, nameof(SoftFallback.SignedShlReg)); + + EmitVectorInsert(context, op.Rd, index, op.Size); + } + + if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } + } + public static void Srshr_S(ILEmitterCtx context) { EmitScalarShrImmOpSx(context, ShrImmFlags.Round); @@ -252,7 +334,28 @@ namespace ChocolArm64.Instructions public static void Sshl_V(ILEmitterCtx context) { - EmitVectorShl(context, signed: true); + OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; + + int bytes = op.GetBitsCount() >> 3; + int elems = bytes >> op.Size; + + for (int index = 0; index < elems; index++) + { + EmitVectorExtractSx(context, op.Rn, index, op.Size); + EmitVectorExtractSx(context, op.Rm, index, op.Size); + + context.Emit(OpCodes.Ldc_I4_0); + context.EmitLdc_I4(op.Size); + + SoftFallback.EmitCall(context, nameof(SoftFallback.SignedShlReg)); + + EmitVectorInsert(context, op.Rd, index, op.Size); + } + + if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } } public static void Sshll_V(ILEmitterCtx context) @@ -330,6 +433,34 @@ namespace ChocolArm64.Instructions } } + public static void Uqrshl_V(ILEmitterCtx context) + { + OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; + + int bytes = op.GetBitsCount() >> 3; + int elems = bytes >> op.Size; + + for (int index = 0; index < elems; index++) + { + EmitVectorExtractZx(context, op.Rn, index, op.Size); + EmitVectorExtractZx(context, op.Rm, index, op.Size); + + context.Emit(OpCodes.Ldc_I4_1); + context.EmitLdc_I4(op.Size); + + context.EmitLdarg(TranslatedSub.StateArgIdx); + + SoftFallback.EmitCall(context, nameof(SoftFallback.UnsignedShlRegSatQ)); + + EmitVectorInsert(context, op.Rd, index, op.Size); + } + + if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } + } + public static void Uqrshrn_S(ILEmitterCtx context) { EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarZxZx); @@ -340,6 +471,34 @@ namespace ChocolArm64.Instructions EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorZxZx); } + public static void Uqshl_V(ILEmitterCtx context) + { + OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; + + int bytes = op.GetBitsCount() >> 3; + int elems = bytes >> op.Size; + + for (int index = 0; index < elems; index++) + { + EmitVectorExtractZx(context, op.Rn, index, op.Size); + EmitVectorExtractZx(context, op.Rm, index, op.Size); + + context.Emit(OpCodes.Ldc_I4_0); + context.EmitLdc_I4(op.Size); + + context.EmitLdarg(TranslatedSub.StateArgIdx); + + SoftFallback.EmitCall(context, nameof(SoftFallback.UnsignedShlRegSatQ)); + + EmitVectorInsert(context, op.Rd, index, op.Size); + } + + if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } + } + public static void Uqshrn_S(ILEmitterCtx context) { EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarZxZx); @@ -350,6 +509,32 @@ namespace ChocolArm64.Instructions EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorZxZx); } + public static void Urshl_V(ILEmitterCtx context) + { + OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; + + int bytes = op.GetBitsCount() >> 3; + int elems = bytes >> op.Size; + + for (int index = 0; index < elems; index++) + { + EmitVectorExtractZx(context, op.Rn, index, op.Size); + EmitVectorExtractZx(context, op.Rm, index, op.Size); + + context.Emit(OpCodes.Ldc_I4_1); + context.EmitLdc_I4(op.Size); + + SoftFallback.EmitCall(context, nameof(SoftFallback.UnsignedShlReg)); + + EmitVectorInsert(context, op.Rd, index, op.Size); + } + + if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } + } + public static void Urshr_S(ILEmitterCtx context) { EmitScalarShrImmOpZx(context, ShrImmFlags.Round); @@ -450,7 +635,28 @@ namespace ChocolArm64.Instructions public static void Ushl_V(ILEmitterCtx context) { - EmitVectorShl(context, signed: false); + OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; + + int bytes = op.GetBitsCount() >> 3; + int elems = bytes >> op.Size; + + for (int index = 0; index < elems; index++) + { + EmitVectorExtractZx(context, op.Rn, index, op.Size); + EmitVectorExtractZx(context, op.Rm, index, op.Size); + + context.Emit(OpCodes.Ldc_I4_0); + context.EmitLdc_I4(op.Size); + + SoftFallback.EmitCall(context, nameof(SoftFallback.UnsignedShlReg)); + + EmitVectorInsert(context, op.Rd, index, op.Size); + } + + if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } } public static void Ushll_V(ILEmitterCtx context) @@ -526,69 +732,6 @@ namespace ChocolArm64.Instructions } } - private static void EmitVectorShl(ILEmitterCtx context, bool signed) - { - //This instruction shifts the value on vector A by the number of bits - //specified on the signed, lower 8 bits of vector B. If the shift value - //is greater or equal to the data size of each lane, then the result is zero. - //Additionally, negative shifts produces right shifts by the negated shift value. - OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp; - - int maxShift = 8 << op.Size; - - Action emit = () => - { - ILLabel lblShl = new ILLabel(); - ILLabel lblZero = new ILLabel(); - ILLabel lblEnd = new ILLabel(); - - void EmitShift(OpCode ilOp) - { - context.Emit(OpCodes.Dup); - - context.EmitLdc_I4(maxShift); - - context.Emit(OpCodes.Bge_S, lblZero); - context.Emit(ilOp); - context.Emit(OpCodes.Br_S, lblEnd); - } - - context.Emit(OpCodes.Conv_I1); - context.Emit(OpCodes.Dup); - - context.EmitLdc_I4(0); - - context.Emit(OpCodes.Bge_S, lblShl); - context.Emit(OpCodes.Neg); - - EmitShift(signed - ? OpCodes.Shr - : OpCodes.Shr_Un); - - context.MarkLabel(lblShl); - - EmitShift(OpCodes.Shl); - - context.MarkLabel(lblZero); - - context.Emit(OpCodes.Pop); - context.Emit(OpCodes.Pop); - - context.EmitLdc_I8(0); - - context.MarkLabel(lblEnd); - }; - - if (signed) - { - EmitVectorBinaryOpSx(context, emit); - } - else - { - EmitVectorBinaryOpZx(context, emit); - } - } - [Flags] private enum ShrImmFlags { diff --git a/ChocolArm64/Instructions/SoftFallback.cs b/ChocolArm64/Instructions/SoftFallback.cs index 8315395a..def95343 100644 --- a/ChocolArm64/Instructions/SoftFallback.cs +++ b/ChocolArm64/Instructions/SoftFallback.cs @@ -16,6 +16,283 @@ namespace ChocolArm64.Instructions context.EmitCall(typeof(SoftFallback), mthdName); } +#region "ShlReg" + public static long SignedShlReg(long value, long shift, bool round, int size) + { + int eSize = 8 << size; + + int shiftLsB = (sbyte)shift; + + if (shiftLsB < 0) + { + return SignedShrReg(value, -shiftLsB, round, eSize); + } + else if (shiftLsB > 0) + { + if (shiftLsB >= eSize) + { + return 0L; + } + + return value << shiftLsB; + } + else /* if (shiftLsB == 0) */ + { + return value; + } + } + + public static ulong UnsignedShlReg(ulong value, ulong shift, bool round, int size) + { + int eSize = 8 << size; + + int shiftLsB = (sbyte)shift; + + if (shiftLsB < 0) + { + return UnsignedShrReg(value, -shiftLsB, round, eSize); + } + else if (shiftLsB > 0) + { + if (shiftLsB >= eSize) + { + return 0UL; + } + + return value << shiftLsB; + } + else /* if (shiftLsB == 0) */ + { + return value; + } + } + + public static long SignedShlRegSatQ(long value, long shift, bool round, int size, CpuThreadState state) + { + int eSize = 8 << size; + + int shiftLsB = (sbyte)shift; + + if (shiftLsB < 0) + { + return SignedShrReg(value, -shiftLsB, round, eSize); + } + else if (shiftLsB > 0) + { + if (shiftLsB >= eSize) + { + return SignedSignSatQ(value, eSize, state); + } + + if (eSize == 64) + { + long shl = value << shiftLsB; + long shr = shl >> shiftLsB; + + if (shr != value) + { + return SignedSignSatQ(value, eSize, state); + } + else /* if (shr == value) */ + { + return shl; + } + } + else /* if (eSize != 64) */ + { + return SignedSrcSignedDstSatQ(value << shiftLsB, size, state); + } + } + else /* if (shiftLsB == 0) */ + { + return value; + } + } + + public static ulong UnsignedShlRegSatQ(ulong value, ulong shift, bool round, int size, CpuThreadState state) + { + int eSize = 8 << size; + + int shiftLsB = (sbyte)shift; + + if (shiftLsB < 0) + { + return UnsignedShrReg(value, -shiftLsB, round, eSize); + } + else if (shiftLsB > 0) + { + if (shiftLsB >= eSize) + { + return UnsignedSignSatQ(value, eSize, state); + } + + if (eSize == 64) + { + ulong shl = value << shiftLsB; + ulong shr = shl >> shiftLsB; + + if (shr != value) + { + return UnsignedSignSatQ(value, eSize, state); + } + else /* if (shr == value) */ + { + return shl; + } + } + else /* if (eSize != 64) */ + { + return UnsignedSrcUnsignedDstSatQ(value << shiftLsB, size, state); + } + } + else /* if (shiftLsB == 0) */ + { + return value; + } + } + + private static long SignedShrReg(long value, int shift, bool round, int eSize) // shift := [1, 128]; eSize := {8, 16, 32, 64}. + { + if (round) + { + if (shift >= eSize) + { + return 0L; + } + + long roundConst = 1L << (shift - 1); + + long add = value + roundConst; + + if (eSize == 64) + { + if ((~value & (value ^ add)) < 0L) + { + return (long)((ulong)add >> shift); + } + else + { + return add >> shift; + } + } + else /* if (eSize != 64) */ + { + return add >> shift; + } + } + else /* if (!round) */ + { + if (shift >= eSize) + { + if (value < 0L) + { + return -1L; + } + else /* if (value >= 0L) */ + { + return 0L; + } + } + + return value >> shift; + } + } + + private static ulong UnsignedShrReg(ulong value, int shift, bool round, int eSize) // shift := [1, 128]; eSize := {8, 16, 32, 64}. + { + if (round) + { + if (shift > 64) + { + return 0UL; + } + + ulong roundConst = 1UL << (shift - 1); + + ulong add = value + roundConst; + + if (eSize == 64) + { + if ((add < value) && (add < roundConst)) + { + if (shift == 64) + { + return 1UL; + } + + return (add >> shift) | (0x8000000000000000UL >> (shift - 1)); + } + else + { + if (shift == 64) + { + return 0UL; + } + + return add >> shift; + } + } + else /* if (eSize != 64) */ + { + if (shift == 64) + { + return 0UL; + } + + return add >> shift; + } + } + else /* if (!round) */ + { + if (shift >= eSize) + { + return 0UL; + } + + return value >> shift; + } + } + + private static long SignedSignSatQ(long op, int eSize, CpuThreadState state) // eSize := {8, 16, 32, 64}. + { + long tMaxValue = (1L << (eSize - 1)) - 1L; + long tMinValue = -(1L << (eSize - 1)); + + if (op > 0L) + { + state.SetFpsrFlag(Fpsr.Qc); + + return tMaxValue; + } + else if (op < 0L) + { + state.SetFpsrFlag(Fpsr.Qc); + + return tMinValue; + } + else + { + return 0L; + } + } + + private static ulong UnsignedSignSatQ(ulong op, int eSize, CpuThreadState state) // eSize := {8, 16, 32, 64}. + { + ulong tMaxValue = ulong.MaxValue >> (64 - eSize); + + if (op > 0UL) + { + state.SetFpsrFlag(Fpsr.Qc); + + return tMaxValue; + } + else + { + return 0UL; + } + } +#endregion + #region "ShrImm64" public static long SignedShrImm64(long value, long roundConst, int shift) { @@ -31,7 +308,7 @@ namespace ChocolArm64.Instructions { return -1L; } - else + else /* if (value >= 0L) */ { return 0L; } diff --git a/ChocolArm64/OpCodeTable.cs b/ChocolArm64/OpCodeTable.cs index 8151718f..9b9b993a 100644 --- a/ChocolArm64/OpCodeTable.cs +++ b/ChocolArm64/OpCodeTable.cs @@ -427,10 +427,12 @@ namespace ChocolArm64 SetA64("01111110101xxxxx101101xxxxxxxxxx", InstEmit.Sqrdmulh_S, typeof(OpCodeSimdReg64)); SetA64("0x101110011xxxxx101101xxxxxxxxxx", InstEmit.Sqrdmulh_V, typeof(OpCodeSimdReg64)); SetA64("0x101110101xxxxx101101xxxxxxxxxx", InstEmit.Sqrdmulh_V, typeof(OpCodeSimdReg64)); + SetA64("0>001110<<1xxxxx010111xxxxxxxxxx", InstEmit.Sqrshl_V, typeof(OpCodeSimdReg64)); SetA64("0101111100>>>xxx100111xxxxxxxxxx", InstEmit.Sqrshrn_S, typeof(OpCodeSimdShImm64)); SetA64("0x00111100>>>xxx100111xxxxxxxxxx", InstEmit.Sqrshrn_V, typeof(OpCodeSimdShImm64)); SetA64("0111111100>>>xxx100011xxxxxxxxxx", InstEmit.Sqrshrun_S, typeof(OpCodeSimdShImm64)); SetA64("0x10111100>>>xxx100011xxxxxxxxxx", InstEmit.Sqrshrun_V, typeof(OpCodeSimdShImm64)); + SetA64("0>001110<<1xxxxx010011xxxxxxxxxx", InstEmit.Sqshl_V, typeof(OpCodeSimdReg64)); SetA64("0101111100>>>xxx100101xxxxxxxxxx", InstEmit.Sqshrn_S, typeof(OpCodeSimdShImm64)); SetA64("0x00111100>>>xxx100101xxxxxxxxxx", InstEmit.Sqshrn_V, typeof(OpCodeSimdShImm64)); SetA64("0111111100>>>xxx100001xxxxxxxxxx", InstEmit.Sqshrun_S, typeof(OpCodeSimdShImm64)); @@ -442,6 +444,7 @@ namespace ChocolArm64 SetA64("01111110<<100001001010xxxxxxxxxx", InstEmit.Sqxtun_S, typeof(OpCodeSimd64)); SetA64("0x101110<<100001001010xxxxxxxxxx", InstEmit.Sqxtun_V, typeof(OpCodeSimd64)); SetA64("0x001110<<1xxxxx000101xxxxxxxxxx", InstEmit.Srhadd_V, typeof(OpCodeSimdReg64)); + SetA64("0>001110<<1xxxxx010101xxxxxxxxxx", InstEmit.Srshl_V, typeof(OpCodeSimdReg64)); SetA64("0101111101xxxxxx001001xxxxxxxxxx", InstEmit.Srshr_S, typeof(OpCodeSimdShImm64)); SetA64("0x00111100>>>xxx001001xxxxxxxxxx", InstEmit.Srshr_V, typeof(OpCodeSimdShImm64)); SetA64("0100111101xxxxxx001001xxxxxxxxxx", InstEmit.Srshr_V, typeof(OpCodeSimdShImm64)); @@ -501,8 +504,10 @@ namespace ChocolArm64 SetA64("0x101110<<1xxxxx110000xxxxxxxxxx", InstEmit.Umull_V, typeof(OpCodeSimdReg64)); SetA64("01111110xx1xxxxx000011xxxxxxxxxx", InstEmit.Uqadd_S, typeof(OpCodeSimdReg64)); SetA64("0>101110<<1xxxxx000011xxxxxxxxxx", InstEmit.Uqadd_V, typeof(OpCodeSimdReg64)); + SetA64("0>101110<<1xxxxx010111xxxxxxxxxx", InstEmit.Uqrshl_V, typeof(OpCodeSimdReg64)); SetA64("0111111100>>>xxx100111xxxxxxxxxx", InstEmit.Uqrshrn_S, typeof(OpCodeSimdShImm64)); SetA64("0x10111100>>>xxx100111xxxxxxxxxx", InstEmit.Uqrshrn_V, typeof(OpCodeSimdShImm64)); + SetA64("0>101110<<1xxxxx010011xxxxxxxxxx", InstEmit.Uqshl_V, typeof(OpCodeSimdReg64)); SetA64("0111111100>>>xxx100101xxxxxxxxxx", InstEmit.Uqshrn_S, typeof(OpCodeSimdShImm64)); SetA64("0x10111100>>>xxx100101xxxxxxxxxx", InstEmit.Uqshrn_V, typeof(OpCodeSimdShImm64)); SetA64("01111110xx1xxxxx001011xxxxxxxxxx", InstEmit.Uqsub_S, typeof(OpCodeSimdReg64)); @@ -510,6 +515,7 @@ namespace ChocolArm64 SetA64("01111110<<100001010010xxxxxxxxxx", InstEmit.Uqxtn_S, typeof(OpCodeSimd64)); SetA64("0x101110<<100001010010xxxxxxxxxx", InstEmit.Uqxtn_V, typeof(OpCodeSimd64)); SetA64("0x101110<<1xxxxx000101xxxxxxxxxx", InstEmit.Urhadd_V, typeof(OpCodeSimdReg64)); + SetA64("0>101110<<1xxxxx010101xxxxxxxxxx", InstEmit.Urshl_V, typeof(OpCodeSimdReg64)); SetA64("0111111101xxxxxx001001xxxxxxxxxx", InstEmit.Urshr_S, typeof(OpCodeSimdShImm64)); SetA64("0x10111100>>>xxx001001xxxxxxxxxx", InstEmit.Urshr_V, typeof(OpCodeSimdShImm64)); SetA64("0110111101xxxxxx001001xxxxxxxxxx", InstEmit.Urshr_V, typeof(OpCodeSimdShImm64)); diff --git a/ChocolArm64/Optimizations.cs b/ChocolArm64/Optimizations.cs index f2b0ffba..aab5eca7 100644 --- a/ChocolArm64/Optimizations.cs +++ b/ChocolArm64/Optimizations.cs @@ -8,11 +8,13 @@ public static class Optimizations private static bool _useSseIfAvailable = true; private static bool _useSse2IfAvailable = true; + private static bool _useSsse3IfAvailable = true; private static bool _useSse41IfAvailable = true; private static bool _useSse42IfAvailable = true; internal static bool UseSse = (_useAllSseIfAvailable && _useSseIfAvailable) && Sse.IsSupported; internal static bool UseSse2 = (_useAllSseIfAvailable && _useSse2IfAvailable) && Sse2.IsSupported; + internal static bool UseSsse3 = (_useAllSseIfAvailable && _useSsse3IfAvailable) && Ssse3.IsSupported; internal static bool UseSse41 = (_useAllSseIfAvailable && _useSse41IfAvailable) && Sse41.IsSupported; internal static bool UseSse42 = (_useAllSseIfAvailable && _useSse42IfAvailable) && Sse42.IsSupported; } diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs b/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs index d43447a7..cceb8b10 100644 --- a/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs +++ b/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs @@ -420,6 +420,36 @@ namespace Ryujinx.Tests.Cpu }; } + private static uint[] _ShlReg_V_8B_4H_2S_() + { + return new uint[] + { + 0x0E205C00u, // SQRSHL V0.8B, V0.8B, V0.8B + 0x0E204C00u, // SQSHL V0.8B, V0.8B, V0.8B + 0x0E205400u, // SRSHL V0.8B, V0.8B, V0.8B + 0x0E204400u, // SSHL V0.8B, V0.8B, V0.8B + 0x2E205C00u, // UQRSHL V0.8B, V0.8B, V0.8B + 0x2E204C00u, // UQSHL V0.8B, V0.8B, V0.8B + 0x2E205400u, // URSHL V0.8B, V0.8B, V0.8B + 0x2E204400u // USHL V0.8B, V0.8B, V0.8B + }; + } + + private static uint[] _ShlReg_V_16B_8H_4S_2D_() + { + return new uint[] + { + 0x4E205C00u, // SQRSHL V0.16B, V0.16B, V0.16B + 0x4E204C00u, // SQSHL V0.16B, V0.16B, V0.16B + 0x4E205400u, // SRSHL V0.16B, V0.16B, V0.16B + 0x4E204400u, // SSHL V0.16B, V0.16B, V0.16B + 0x6E205C00u, // UQRSHL V0.16B, V0.16B, V0.16B + 0x6E204C00u, // UQSHL V0.16B, V0.16B, V0.16B + 0x6E205400u, // URSHL V0.16B, V0.16B, V0.16B + 0x6E204400u // USHL V0.16B, V0.16B, V0.16B + }; + } + private static uint[] _U_Max_Min_P_V_() { return new uint[] @@ -2602,6 +2632,50 @@ namespace Ryujinx.Tests.Cpu CompareAgainstUnicorn(); } + [Test, Pairwise] + public void ShlReg_V_8B_4H_2S([ValueSource("_ShlReg_V_8B_4H_2S_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [Values(2u, 0u)] uint rm, + [ValueSource("_8B4H2S_")] [Random(RndCnt)] ulong z, + [ValueSource("_8B4H2S_")] [Random(RndCnt)] ulong a, + [ValueSource("_8B4H2S_")] [Random(0ul, 255ul, RndCnt)] ulong b, + [Values(0b00u, 0b01u, 0b10u)] uint size) // <8B, 4H, 2S> + { + opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); + opcodes |= ((size & 3) << 22); + + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0(a); + Vector128 v2 = MakeVectorE0(b); + + SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2); + + CompareAgainstUnicorn(fpsrMask: Fpsr.Qc); + } + + [Test, Pairwise] + public void ShlReg_V_16B_8H_4S_2D([ValueSource("_ShlReg_V_16B_8H_4S_2D_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u, 0u)] uint rn, + [Values(2u, 0u)] uint rm, + [ValueSource("_8B4H2S1D_")] [Random(RndCnt)] ulong z, + [ValueSource("_8B4H2S1D_")] [Random(RndCnt)] ulong a, + [ValueSource("_8B4H2S1D_")] [Random(0ul, 255ul, RndCnt)] ulong b, + [Values(0b00u, 0b01u, 0b10u, 0b11u)] uint size) // <16B, 8H, 4S, 2D> + { + opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); + opcodes |= ((size & 3) << 22); + + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0E1(a, a); + Vector128 v2 = MakeVectorE0E1(b, b); + + SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2); + + CompareAgainstUnicorn(fpsrMask: Fpsr.Qc); + } + [Test, Pairwise, Description("SSUBL{2} ., ., .")] public void Ssubl_V_8B8H_4H4S_2S2D([Values(0u)] uint rd, [Values(1u, 0u)] uint rn,