mirror of
https://github.com/GreemDev/Ryujinx.git
synced 2025-01-25 16:22:00 +00:00
b8eb6abecc
* Refactor shader GPU state and memory access * Fix NVDEC project build * Address PR feedback and add missing XML comments
620 lines
No EOL
19 KiB
C#
620 lines
No EOL
19 KiB
C#
using Ryujinx.Graphics.Shader.Decoders;
|
|
using Ryujinx.Graphics.Shader.IntermediateRepresentation;
|
|
using Ryujinx.Graphics.Shader.Translation;
|
|
|
|
using static Ryujinx.Graphics.Shader.Instructions.InstEmitHelper;
|
|
using static Ryujinx.Graphics.Shader.IntermediateRepresentation.OperandHelper;
|
|
|
|
namespace Ryujinx.Graphics.Shader.Instructions
|
|
{
|
|
static partial class InstEmit
|
|
{
|
|
private enum MemoryRegion
|
|
{
|
|
Local,
|
|
Shared
|
|
}
|
|
|
|
public static void Ald(EmitterContext context)
|
|
{
|
|
OpCodeAttribute op = (OpCodeAttribute)context.CurrOp;
|
|
|
|
Operand primVertex = context.Copy(GetSrcC(context));
|
|
|
|
for (int index = 0; index < op.Count; index++)
|
|
{
|
|
Register rd = new Register(op.Rd.Index + index, RegisterType.Gpr);
|
|
|
|
if (rd.IsRZ)
|
|
{
|
|
break;
|
|
}
|
|
|
|
Operand src = Attribute(op.AttributeOffset + index * 4);
|
|
|
|
context.Copy(Register(rd), context.LoadAttribute(src, primVertex));
|
|
}
|
|
}
|
|
|
|
public static void Ast(EmitterContext context)
|
|
{
|
|
OpCodeAttribute op = (OpCodeAttribute)context.CurrOp;
|
|
|
|
for (int index = 0; index < op.Count; index++)
|
|
{
|
|
if (op.Rd.Index + index > RegisterConsts.RegisterZeroIndex)
|
|
{
|
|
break;
|
|
}
|
|
|
|
Register rd = new Register(op.Rd.Index + index, RegisterType.Gpr);
|
|
|
|
Operand dest = Attribute(op.AttributeOffset + index * 4);
|
|
|
|
context.Copy(dest, Register(rd));
|
|
}
|
|
}
|
|
|
|
public static void Atoms(EmitterContext context)
|
|
{
|
|
OpCodeAtom op = (OpCodeAtom)context.CurrOp;
|
|
|
|
Operand offset = context.ShiftRightU32(GetSrcA(context), Const(2));
|
|
|
|
offset = context.IAdd(offset, Const(op.Offset));
|
|
|
|
Operand value = GetSrcB(context);
|
|
|
|
Operand res = EmitAtomicOp(
|
|
context,
|
|
Instruction.MrShared,
|
|
op.AtomicOp,
|
|
op.Type,
|
|
offset,
|
|
Const(0),
|
|
value);
|
|
|
|
context.Copy(GetDest(context), res);
|
|
}
|
|
|
|
public static void Bar(EmitterContext context)
|
|
{
|
|
OpCodeBarrier op = (OpCodeBarrier)context.CurrOp;
|
|
|
|
// TODO: Support other modes.
|
|
if (op.Mode == BarrierMode.Sync)
|
|
{
|
|
context.Barrier();
|
|
}
|
|
else
|
|
{
|
|
context.Config.GpuAccessor.Log($"Invalid barrier mode: {op.Mode}.");
|
|
}
|
|
}
|
|
|
|
public static void Ipa(EmitterContext context)
|
|
{
|
|
OpCodeIpa op = (OpCodeIpa)context.CurrOp;
|
|
|
|
Operand res = Attribute(op.AttributeOffset);
|
|
|
|
if (op.AttributeOffset >= AttributeConsts.UserAttributeBase &&
|
|
op.AttributeOffset < AttributeConsts.UserAttributeEnd)
|
|
{
|
|
int index = (op.AttributeOffset - AttributeConsts.UserAttributeBase) >> 4;
|
|
|
|
if (context.Config.ImapTypes[index].GetFirstUsedType() == PixelImap.Perspective)
|
|
{
|
|
res = context.FPMultiply(res, Attribute(AttributeConsts.PositionW));
|
|
}
|
|
}
|
|
|
|
if (op.Mode == InterpolationMode.Default)
|
|
{
|
|
Operand srcB = GetSrcB(context);
|
|
|
|
res = context.FPMultiply(res, srcB);
|
|
}
|
|
|
|
res = context.FPSaturate(res, op.Saturate);
|
|
|
|
context.Copy(GetDest(context), res);
|
|
}
|
|
|
|
public static void Isberd(EmitterContext context)
|
|
{
|
|
// This instruction performs a load from ISBE memory,
|
|
// however it seems to be only used to get some vertex
|
|
// input data, so we instead propagate the offset so that
|
|
// it can be used on the attribute load.
|
|
context.Copy(GetDest(context), GetSrcA(context));
|
|
}
|
|
|
|
public static void Ld(EmitterContext context)
|
|
{
|
|
EmitLoad(context, MemoryRegion.Local);
|
|
}
|
|
|
|
public static void Ldc(EmitterContext context)
|
|
{
|
|
OpCodeLdc op = (OpCodeLdc)context.CurrOp;
|
|
|
|
if (op.Size > IntegerSize.B64)
|
|
{
|
|
context.Config.GpuAccessor.Log($"Invalid LDC size: {op.Size}.");
|
|
}
|
|
|
|
bool isSmallInt = op.Size < IntegerSize.B32;
|
|
|
|
int count = op.Size == IntegerSize.B64 ? 2 : 1;
|
|
|
|
Operand addr = context.IAdd(GetSrcA(context), Const(op.Offset));
|
|
|
|
Operand wordOffset = context.ShiftRightU32(addr, Const(2));
|
|
|
|
Operand bitOffset = GetBitOffset(context, addr);
|
|
|
|
for (int index = 0; index < count; index++)
|
|
{
|
|
Register rd = new Register(op.Rd.Index + index, RegisterType.Gpr);
|
|
|
|
if (rd.IsRZ)
|
|
{
|
|
break;
|
|
}
|
|
|
|
Operand offset = context.IAdd(wordOffset, Const(index));
|
|
|
|
Operand value = context.LoadConstant(Const(op.Slot), offset);
|
|
|
|
if (isSmallInt)
|
|
{
|
|
value = ExtractSmallInt(context, op.Size, bitOffset, value);
|
|
}
|
|
|
|
context.Copy(Register(rd), value);
|
|
}
|
|
}
|
|
|
|
public static void Ldg(EmitterContext context)
|
|
{
|
|
EmitLoadGlobal(context);
|
|
}
|
|
|
|
public static void Lds(EmitterContext context)
|
|
{
|
|
EmitLoad(context, MemoryRegion.Shared);
|
|
}
|
|
|
|
public static void Membar(EmitterContext context)
|
|
{
|
|
OpCodeMemoryBarrier op = (OpCodeMemoryBarrier)context.CurrOp;
|
|
|
|
if (op.Level == BarrierLevel.Cta)
|
|
{
|
|
context.GroupMemoryBarrier();
|
|
}
|
|
else
|
|
{
|
|
context.MemoryBarrier();
|
|
}
|
|
}
|
|
|
|
public static void Out(EmitterContext context)
|
|
{
|
|
OpCode op = context.CurrOp;
|
|
|
|
bool emit = op.RawOpCode.Extract(39);
|
|
bool cut = op.RawOpCode.Extract(40);
|
|
|
|
if (!(emit || cut))
|
|
{
|
|
context.Config.GpuAccessor.Log("Invalid OUT encoding.");
|
|
}
|
|
|
|
if (emit)
|
|
{
|
|
context.EmitVertex();
|
|
}
|
|
|
|
if (cut)
|
|
{
|
|
context.EndPrimitive();
|
|
}
|
|
}
|
|
|
|
public static void Red(EmitterContext context)
|
|
{
|
|
OpCodeRed op = (OpCodeRed)context.CurrOp;
|
|
|
|
(Operand addrLow, Operand addrHigh) = Get40BitsAddress(context, op.Ra, op.Extended, op.Offset);
|
|
|
|
EmitAtomicOp(
|
|
context,
|
|
Instruction.MrGlobal,
|
|
op.AtomicOp,
|
|
op.Type,
|
|
addrLow,
|
|
addrHigh,
|
|
GetDest(context));
|
|
}
|
|
|
|
public static void St(EmitterContext context)
|
|
{
|
|
EmitStore(context, MemoryRegion.Local);
|
|
}
|
|
|
|
public static void Stg(EmitterContext context)
|
|
{
|
|
EmitStoreGlobal(context);
|
|
}
|
|
|
|
public static void Sts(EmitterContext context)
|
|
{
|
|
EmitStore(context, MemoryRegion.Shared);
|
|
}
|
|
|
|
private static Operand EmitAtomicOp(
|
|
EmitterContext context,
|
|
Instruction mr,
|
|
AtomicOp op,
|
|
ReductionType type,
|
|
Operand addrLow,
|
|
Operand addrHigh,
|
|
Operand value)
|
|
{
|
|
Operand res = Const(0);
|
|
|
|
switch (op)
|
|
{
|
|
case AtomicOp.Add:
|
|
if (type == ReductionType.S32 || type == ReductionType.U32)
|
|
{
|
|
res = context.AtomicAdd(mr, addrLow, addrHigh, value);
|
|
}
|
|
else
|
|
{
|
|
context.Config.GpuAccessor.Log($"Invalid reduction type: {type}.");
|
|
}
|
|
break;
|
|
case AtomicOp.BitwiseAnd:
|
|
if (type == ReductionType.S32 || type == ReductionType.U32)
|
|
{
|
|
res = context.AtomicAnd(mr, addrLow, addrHigh, value);
|
|
}
|
|
else
|
|
{
|
|
context.Config.GpuAccessor.Log($"Invalid reduction type: {type}.");
|
|
}
|
|
break;
|
|
case AtomicOp.BitwiseExclusiveOr:
|
|
if (type == ReductionType.S32 || type == ReductionType.U32)
|
|
{
|
|
res = context.AtomicXor(mr, addrLow, addrHigh, value);
|
|
}
|
|
else
|
|
{
|
|
context.Config.GpuAccessor.Log($"Invalid reduction type: {type}.");
|
|
}
|
|
break;
|
|
case AtomicOp.BitwiseOr:
|
|
if (type == ReductionType.S32 || type == ReductionType.U32)
|
|
{
|
|
res = context.AtomicOr(mr, addrLow, addrHigh, value);
|
|
}
|
|
else
|
|
{
|
|
context.Config.GpuAccessor.Log($"Invalid reduction type: {type}.");
|
|
}
|
|
break;
|
|
case AtomicOp.Maximum:
|
|
if (type == ReductionType.S32)
|
|
{
|
|
res = context.AtomicMaxS32(mr, addrLow, addrHigh, value);
|
|
}
|
|
else if (type == ReductionType.U32)
|
|
{
|
|
res = context.AtomicMaxU32(mr, addrLow, addrHigh, value);
|
|
}
|
|
else
|
|
{
|
|
context.Config.GpuAccessor.Log($"Invalid reduction type: {type}.");
|
|
}
|
|
break;
|
|
case AtomicOp.Minimum:
|
|
if (type == ReductionType.S32)
|
|
{
|
|
res = context.AtomicMinS32(mr, addrLow, addrHigh, value);
|
|
}
|
|
else if (type == ReductionType.U32)
|
|
{
|
|
res = context.AtomicMinU32(mr, addrLow, addrHigh, value);
|
|
}
|
|
else
|
|
{
|
|
context.Config.GpuAccessor.Log($"Invalid reduction type: {type}.");
|
|
}
|
|
break;
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
private static void EmitLoad(EmitterContext context, MemoryRegion region)
|
|
{
|
|
OpCodeMemory op = (OpCodeMemory)context.CurrOp;
|
|
|
|
if (op.Size > IntegerSize.B128)
|
|
{
|
|
context.Config.GpuAccessor.Log($"Invalid load size: {op.Size}.");
|
|
}
|
|
|
|
bool isSmallInt = op.Size < IntegerSize.B32;
|
|
|
|
int count = 1;
|
|
|
|
switch (op.Size)
|
|
{
|
|
case IntegerSize.B64: count = 2; break;
|
|
case IntegerSize.B128: count = 4; break;
|
|
}
|
|
|
|
Operand baseOffset = context.IAdd(GetSrcA(context), Const(op.Offset));
|
|
|
|
// Word offset = byte offset / 4 (one word = 4 bytes).
|
|
Operand wordOffset = context.ShiftRightU32(baseOffset, Const(2));
|
|
|
|
Operand bitOffset = GetBitOffset(context, baseOffset);
|
|
|
|
for (int index = 0; index < count; index++)
|
|
{
|
|
Register rd = new Register(op.Rd.Index + index, RegisterType.Gpr);
|
|
|
|
if (rd.IsRZ)
|
|
{
|
|
break;
|
|
}
|
|
|
|
Operand offset = context.IAdd(wordOffset, Const(index));
|
|
|
|
Operand value = null;
|
|
|
|
switch (region)
|
|
{
|
|
case MemoryRegion.Local: value = context.LoadLocal (offset); break;
|
|
case MemoryRegion.Shared: value = context.LoadShared(offset); break;
|
|
}
|
|
|
|
if (isSmallInt)
|
|
{
|
|
value = ExtractSmallInt(context, op.Size, bitOffset, value);
|
|
}
|
|
|
|
context.Copy(Register(rd), value);
|
|
}
|
|
}
|
|
|
|
private static void EmitLoadGlobal(EmitterContext context)
|
|
{
|
|
OpCodeMemory op = (OpCodeMemory)context.CurrOp;
|
|
|
|
bool isSmallInt = op.Size < IntegerSize.B32;
|
|
|
|
int count = GetVectorCount(op.Size);
|
|
|
|
(Operand addrLow, Operand addrHigh) = Get40BitsAddress(context, op.Ra, op.Extended, op.Offset);
|
|
|
|
Operand bitOffset = GetBitOffset(context, addrLow);
|
|
|
|
for (int index = 0; index < count; index++)
|
|
{
|
|
Register rd = new Register(op.Rd.Index + index, RegisterType.Gpr);
|
|
|
|
if (rd.IsRZ)
|
|
{
|
|
break;
|
|
}
|
|
|
|
Operand value = context.LoadGlobal(context.IAdd(addrLow, Const(index * 4)), addrHigh);
|
|
|
|
if (isSmallInt)
|
|
{
|
|
value = ExtractSmallInt(context, op.Size, bitOffset, value);
|
|
}
|
|
|
|
context.Copy(Register(rd), value);
|
|
}
|
|
}
|
|
|
|
private static void EmitStore(EmitterContext context, MemoryRegion region)
|
|
{
|
|
OpCodeMemory op = (OpCodeMemory)context.CurrOp;
|
|
|
|
if (op.Size > IntegerSize.B128)
|
|
{
|
|
context.Config.GpuAccessor.Log($"Invalid store size: {op.Size}.");
|
|
}
|
|
|
|
bool isSmallInt = op.Size < IntegerSize.B32;
|
|
|
|
int count = 1;
|
|
|
|
switch (op.Size)
|
|
{
|
|
case IntegerSize.B64: count = 2; break;
|
|
case IntegerSize.B128: count = 4; break;
|
|
}
|
|
|
|
Operand baseOffset = context.IAdd(GetSrcA(context), Const(op.Offset));
|
|
|
|
Operand wordOffset = context.ShiftRightU32(baseOffset, Const(2));
|
|
|
|
Operand bitOffset = GetBitOffset(context, baseOffset);
|
|
|
|
for (int index = 0; index < count; index++)
|
|
{
|
|
Register rd = new Register(op.Rd.Index + index, RegisterType.Gpr);
|
|
|
|
Operand value = Register(rd);
|
|
|
|
Operand offset = context.IAdd(wordOffset, Const(index));
|
|
|
|
if (isSmallInt)
|
|
{
|
|
Operand word = null;
|
|
|
|
switch (region)
|
|
{
|
|
case MemoryRegion.Local: word = context.LoadLocal (offset); break;
|
|
case MemoryRegion.Shared: word = context.LoadShared(offset); break;
|
|
}
|
|
|
|
value = InsertSmallInt(context, op.Size, bitOffset, word, value);
|
|
}
|
|
|
|
switch (region)
|
|
{
|
|
case MemoryRegion.Local: context.StoreLocal (offset, value); break;
|
|
case MemoryRegion.Shared: context.StoreShared(offset, value); break;
|
|
}
|
|
|
|
if (rd.IsRZ)
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
private static void EmitStoreGlobal(EmitterContext context)
|
|
{
|
|
OpCodeMemory op = (OpCodeMemory)context.CurrOp;
|
|
|
|
bool isSmallInt = op.Size < IntegerSize.B32;
|
|
|
|
int count = GetVectorCount(op.Size);
|
|
|
|
(Operand addrLow, Operand addrHigh) = Get40BitsAddress(context, op.Ra, op.Extended, op.Offset);
|
|
|
|
Operand bitOffset = GetBitOffset(context, addrLow);
|
|
|
|
for (int index = 0; index < count; index++)
|
|
{
|
|
Register rd = new Register(op.Rd.Index + index, RegisterType.Gpr);
|
|
|
|
Operand value = Register(rd);
|
|
|
|
if (isSmallInt)
|
|
{
|
|
Operand word = context.LoadGlobal(addrLow, addrHigh);
|
|
|
|
value = InsertSmallInt(context, op.Size, bitOffset, word, value);
|
|
}
|
|
|
|
context.StoreGlobal(context.IAdd(addrLow, Const(index * 4)), addrHigh, value);
|
|
|
|
if (rd.IsRZ)
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
private static int GetVectorCount(IntegerSize size)
|
|
{
|
|
switch (size)
|
|
{
|
|
case IntegerSize.B64:
|
|
return 2;
|
|
case IntegerSize.B128:
|
|
case IntegerSize.UB128:
|
|
return 4;
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
private static (Operand, Operand) Get40BitsAddress(
|
|
EmitterContext context,
|
|
Register ra,
|
|
bool extended,
|
|
int offset)
|
|
{
|
|
Operand addrLow = GetSrcA(context);
|
|
Operand addrHigh;
|
|
|
|
if (extended && !ra.IsRZ)
|
|
{
|
|
addrHigh = Register(ra.Index + 1, RegisterType.Gpr);
|
|
}
|
|
else
|
|
{
|
|
addrHigh = Const(0);
|
|
}
|
|
|
|
Operand offs = Const(offset);
|
|
|
|
addrLow = context.IAdd(addrLow, offs);
|
|
|
|
if (extended)
|
|
{
|
|
Operand carry = context.ICompareLessUnsigned(addrLow, offs);
|
|
|
|
addrHigh = context.IAdd(addrHigh, context.ConditionalSelect(carry, Const(1), Const(0)));
|
|
}
|
|
|
|
return (addrLow, addrHigh);
|
|
}
|
|
|
|
private static Operand GetBitOffset(EmitterContext context, Operand baseOffset)
|
|
{
|
|
// Note: bit offset = (baseOffset & 0b11) * 8.
|
|
// Addresses should be always aligned to the integer type,
|
|
// so we don't need to take unaligned addresses into account.
|
|
return context.ShiftLeft(context.BitwiseAnd(baseOffset, Const(3)), Const(3));
|
|
}
|
|
|
|
private static Operand ExtractSmallInt(
|
|
EmitterContext context,
|
|
IntegerSize size,
|
|
Operand bitOffset,
|
|
Operand value)
|
|
{
|
|
value = context.ShiftRightU32(value, bitOffset);
|
|
|
|
switch (size)
|
|
{
|
|
case IntegerSize.U8: value = ZeroExtendTo32(context, value, 8); break;
|
|
case IntegerSize.U16: value = ZeroExtendTo32(context, value, 16); break;
|
|
case IntegerSize.S8: value = SignExtendTo32(context, value, 8); break;
|
|
case IntegerSize.S16: value = SignExtendTo32(context, value, 16); break;
|
|
}
|
|
|
|
return value;
|
|
}
|
|
|
|
private static Operand InsertSmallInt(
|
|
EmitterContext context,
|
|
IntegerSize size,
|
|
Operand bitOffset,
|
|
Operand word,
|
|
Operand value)
|
|
{
|
|
switch (size)
|
|
{
|
|
case IntegerSize.U8:
|
|
case IntegerSize.S8:
|
|
value = context.BitwiseAnd(value, Const(0xff));
|
|
value = context.BitfieldInsert(word, value, bitOffset, Const(8));
|
|
break;
|
|
|
|
case IntegerSize.U16:
|
|
case IntegerSize.S16:
|
|
value = context.BitwiseAnd(value, Const(0xffff));
|
|
value = context.BitfieldInsert(word, value, bitOffset, Const(16));
|
|
break;
|
|
}
|
|
|
|
return value;
|
|
}
|
|
}
|
|
} |