From d9d18439f6900fd9f05bde41998526281f7638c5 Mon Sep 17 00:00:00 2001 From: gdkchan Date: Wed, 11 Aug 2021 15:59:42 -0300 Subject: [PATCH] Use a new approach for shader BRX targets (#2532) * Use a new approach for shader BRX targets * Make shader cache actually work * Improve the shader pattern matching a bit * Extend LDC search to predecessor blocks, catches more cases * Nit * Only save the amount of constant buffer data actually used. Avoids crashes on partially mapped buffers * Ignore Rd on predicate instructions, as they do not have a Rd register (catches more cases) --- .../Shader/Cache/CacheCollection.cs | 51 ++- .../Shader/Cache/CacheHelper.cs | 53 ++- .../Shader/Cache/CacheManager.cs | 10 + .../Definition/GuestShaderCacheEntryHeader.cs | 10 +- .../Shader/CachedGpuAccessor.cs | 20 +- Ryujinx.Graphics.Gpu/Shader/GpuAccessor.cs | 21 ++ Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs | 59 ++- .../Shader/ShaderCompileTask.cs | 3 +- Ryujinx.Graphics.Shader/Decoders/Block.cs | 36 +- Ryujinx.Graphics.Shader/Decoders/Decoder.cs | 347 ++++++++++++------ Ryujinx.Graphics.Shader/IGpuAccessor.cs | 5 + .../Instructions/InstEmitFlow.cs | 6 + 12 files changed, 472 insertions(+), 149 deletions(-) diff --git a/Ryujinx.Graphics.Gpu/Shader/Cache/CacheCollection.cs b/Ryujinx.Graphics.Gpu/Shader/Cache/CacheCollection.cs index 2660e528..316e027f 100644 --- a/Ryujinx.Graphics.Gpu/Shader/Cache/CacheCollection.cs +++ b/Ryujinx.Graphics.Gpu/Shader/Cache/CacheCollection.cs @@ -38,6 +38,11 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache /// RemoveManifestEntries, + /// + /// Remove entries from the hash manifest and save it, and also deletes the temporary file. + /// + RemoveManifestEntryAndTempFile, + /// /// Flush temporary cache to archive. /// @@ -116,6 +121,9 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache /// private ZipArchive _cacheArchive; + /// + /// Indicates if the cache collection supports modification. + /// public bool IsReadOnly { get; } /// @@ -264,6 +272,21 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache } } + /// + /// Remove given entry from the manifest and delete the temporary file. + /// + /// Entry to remove from the manifest + private void RemoveManifestEntryAndTempFile(Hash128 entry) + { + lock (_hashTable) + { + _hashTable.Remove(entry); + SaveManifest(); + } + + File.Delete(GenCacheTempFilePath(entry)); + } + /// /// Queue a task to flush temporary files to the archive on the worker. /// @@ -440,6 +463,9 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache case CacheFileOperation.RemoveManifestEntries: RemoveManifestEntries((HashSet)task.Data); break; + case CacheFileOperation.RemoveManifestEntryAndTempFile: + RemoveManifestEntryAndTempFile((Hash128)task.Data); + break; case CacheFileOperation.FlushToArchive: FlushToArchive(); break; @@ -472,7 +498,7 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache { if (IsReadOnly) { - Logger.Warning?.Print(LogClass.Gpu, "Trying to add {keyHash} on a read-only cache, ignoring."); + Logger.Warning?.Print(LogClass.Gpu, $"Trying to add {keyHash} on a read-only cache, ignoring."); return; } @@ -521,7 +547,7 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache { if (IsReadOnly) { - Logger.Warning?.Print(LogClass.Gpu, "Trying to replace {keyHash} on a read-only cache, ignoring."); + Logger.Warning?.Print(LogClass.Gpu, $"Trying to replace {keyHash} on a read-only cache, ignoring."); return; } @@ -540,6 +566,27 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache }); } + /// + /// Removes a value at the given hash from the cache. + /// + /// The hash of the value in the cache + public void RemoveValue(ref Hash128 keyHash) + { + if (IsReadOnly) + { + Logger.Warning?.Print(LogClass.Gpu, $"Trying to remove {keyHash} on a read-only cache, ignoring."); + + return; + } + + // Only queue file change operations + _fileWriterWorkerQueue.Add(new CacheFileOperationTask + { + Type = CacheFileOperation.RemoveManifestEntryAndTempFile, + Data = keyHash + }); + } + public void Dispose() { Dispose(true); diff --git a/Ryujinx.Graphics.Gpu/Shader/Cache/CacheHelper.cs b/Ryujinx.Graphics.Gpu/Shader/Cache/CacheHelper.cs index f6caddef..33da42db 100644 --- a/Ryujinx.Graphics.Gpu/Shader/Cache/CacheHelper.cs +++ b/Ryujinx.Graphics.Gpu/Shader/Cache/CacheHelper.cs @@ -371,11 +371,13 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache /// /// Create guest shader cache entries from the runtime contexts. /// - /// The GPU memory manager in use + /// The GPU channel in use /// The runtime contexts /// Guest shader cahe entries from the runtime contexts - public static GuestShaderCacheEntry[] CreateShaderCacheEntries(MemoryManager memoryManager, ReadOnlySpan shaderContexts) + public static GuestShaderCacheEntry[] CreateShaderCacheEntries(GpuChannel channel, ReadOnlySpan shaderContexts) { + MemoryManager memoryManager = channel.MemoryManager; + int startIndex = shaderContexts.Length > 1 ? 1 : 0; GuestShaderCacheEntry[] entries = new GuestShaderCacheEntry[shaderContexts.Length - startIndex]; @@ -389,31 +391,66 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache continue; } + GpuAccessor gpuAccessor = context.GpuAccessor as GpuAccessor; + + ulong cb1DataAddress; + int cb1DataSize = gpuAccessor?.Cb1DataSize ?? 0; + + if (context.Stage == ShaderStage.Compute) + { + cb1DataAddress = channel.BufferManager.GetComputeUniformBufferAddress(1); + } + else + { + int stageIndex = context.Stage switch + { + ShaderStage.TessellationControl => 1, + ShaderStage.TessellationEvaluation => 2, + ShaderStage.Geometry => 3, + ShaderStage.Fragment => 4, + _ => 0 + }; + + cb1DataAddress = channel.BufferManager.GetGraphicsUniformBufferAddress(stageIndex, 1); + } + + int size = context.Size; + TranslatorContext translatorContext2 = i == 1 ? shaderContexts[0] : null; int sizeA = translatorContext2 != null ? translatorContext2.Size : 0; - byte[] code = new byte[context.Size + sizeA]; + byte[] code = new byte[size + cb1DataSize + sizeA]; - memoryManager.GetSpan(context.Address, context.Size).CopyTo(code); + memoryManager.GetSpan(context.Address, size).CopyTo(code); + + if (cb1DataAddress != 0 && cb1DataSize != 0) + { + memoryManager.Physical.GetSpan(cb1DataAddress, cb1DataSize).CopyTo(code.AsSpan().Slice(size, cb1DataSize)); + } if (translatorContext2 != null) { - memoryManager.GetSpan(translatorContext2.Address, sizeA).CopyTo(code.AsSpan().Slice(context.Size, sizeA)); + memoryManager.GetSpan(translatorContext2.Address, sizeA).CopyTo(code.AsSpan().Slice(size + cb1DataSize, sizeA)); } GuestGpuAccessorHeader gpuAccessorHeader = CreateGuestGpuAccessorCache(context.GpuAccessor); - if (context.GpuAccessor is GpuAccessor) + if (gpuAccessor != null) { gpuAccessorHeader.TextureDescriptorCount = context.TextureHandlesForCache.Count; } - GuestShaderCacheEntryHeader header = new GuestShaderCacheEntryHeader(context.Stage, context.Size, sizeA, gpuAccessorHeader); + GuestShaderCacheEntryHeader header = new GuestShaderCacheEntryHeader( + context.Stage, + size + cb1DataSize, + sizeA, + cb1DataSize, + gpuAccessorHeader); GuestShaderCacheEntry entry = new GuestShaderCacheEntry(header, code); - if (context.GpuAccessor is GpuAccessor gpuAccessor) + if (gpuAccessor != null) { foreach (int textureHandle in context.TextureHandlesForCache) { diff --git a/Ryujinx.Graphics.Gpu/Shader/Cache/CacheManager.cs b/Ryujinx.Graphics.Gpu/Shader/Cache/CacheManager.cs index 1ac37704..3fc11e82 100644 --- a/Ryujinx.Graphics.Gpu/Shader/Cache/CacheManager.cs +++ b/Ryujinx.Graphics.Gpu/Shader/Cache/CacheManager.cs @@ -114,6 +114,16 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache _hostProgramCache.ReplaceValue(ref programCodeHash, data); } + /// + /// Removes a shader program present in the program cache. + /// + /// Target program code hash + public void RemoveProgram(ref Hash128 programCodeHash) + { + _guestProgramCache.RemoveValue(ref programCodeHash); + _hostProgramCache.RemoveValue(ref programCodeHash); + } + /// /// Get all guest program hashes. /// diff --git a/Ryujinx.Graphics.Gpu/Shader/Cache/Definition/GuestShaderCacheEntryHeader.cs b/Ryujinx.Graphics.Gpu/Shader/Cache/Definition/GuestShaderCacheEntryHeader.cs index 6d5bb28d..9b22cac5 100644 --- a/Ryujinx.Graphics.Gpu/Shader/Cache/Definition/GuestShaderCacheEntryHeader.cs +++ b/Ryujinx.Graphics.Gpu/Shader/Cache/Definition/GuestShaderCacheEntryHeader.cs @@ -40,9 +40,9 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache.Definition public int SizeA; /// - /// Unused/reserved. + /// Constant buffer 1 data size. /// - public int Reserved4; + public int Cb1DataSize; /// /// The header of the cached gpu accessor. @@ -55,12 +55,14 @@ namespace Ryujinx.Graphics.Gpu.Shader.Cache.Definition /// The stage of this shader /// The size of the code section /// The size of the code2 section if present (Vertex A) + /// Constant buffer 1 data size /// The header of the cached gpu accessor - public GuestShaderCacheEntryHeader(ShaderStage stage, int size, int sizeA, GuestGpuAccessorHeader gpuAccessorHeader) : this() + public GuestShaderCacheEntryHeader(ShaderStage stage, int size, int sizeA, int cb1DataSize, GuestGpuAccessorHeader gpuAccessorHeader) : this() { Stage = stage; - Size = size; + Size = size; SizeA = sizeA; + Cb1DataSize = cb1DataSize; GpuAccessorHeader = gpuAccessorHeader; } } diff --git a/Ryujinx.Graphics.Gpu/Shader/CachedGpuAccessor.cs b/Ryujinx.Graphics.Gpu/Shader/CachedGpuAccessor.cs index a7bd4edb..452dfd83 100644 --- a/Ryujinx.Graphics.Gpu/Shader/CachedGpuAccessor.cs +++ b/Ryujinx.Graphics.Gpu/Shader/CachedGpuAccessor.cs @@ -11,6 +11,7 @@ namespace Ryujinx.Graphics.Gpu.Shader { private readonly GpuContext _context; private readonly ReadOnlyMemory _data; + private readonly ReadOnlyMemory _cb1Data; private readonly GuestGpuAccessorHeader _header; private readonly Dictionary _textureDescriptors; @@ -19,12 +20,19 @@ namespace Ryujinx.Graphics.Gpu.Shader /// /// GPU context /// The data of the shader + /// The constant buffer 1 data of the shader /// The cache of the GPU accessor /// The cache of the texture descriptors - public CachedGpuAccessor(GpuContext context, ReadOnlyMemory data, GuestGpuAccessorHeader header, Dictionary guestTextureDescriptors) + public CachedGpuAccessor( + GpuContext context, + ReadOnlyMemory data, + ReadOnlyMemory cb1Data, + GuestGpuAccessorHeader header, + Dictionary guestTextureDescriptors) { _context = context; _data = data; + _cb1Data = cb1Data; _header = header; _textureDescriptors = new Dictionary(); @@ -34,6 +42,16 @@ namespace Ryujinx.Graphics.Gpu.Shader } } + /// + /// Reads data from the constant buffer 1. + /// + /// Offset in bytes to read from + /// Value at the given offset + public uint ConstantBuffer1Read(int offset) + { + return MemoryMarshal.Cast(_cb1Data.Span.Slice(offset))[0]; + } + /// /// Prints a log message. /// diff --git a/Ryujinx.Graphics.Gpu/Shader/GpuAccessor.cs b/Ryujinx.Graphics.Gpu/Shader/GpuAccessor.cs index b7059b51..6254b1c2 100644 --- a/Ryujinx.Graphics.Gpu/Shader/GpuAccessor.cs +++ b/Ryujinx.Graphics.Gpu/Shader/GpuAccessor.cs @@ -20,6 +20,8 @@ namespace Ryujinx.Graphics.Gpu.Shader private readonly int _localMemorySize; private readonly int _sharedMemorySize; + public int Cb1DataSize { get; private set; } + /// /// Creates a new instance of the GPU state accessor for graphics shader translation. /// @@ -67,6 +69,25 @@ namespace Ryujinx.Graphics.Gpu.Shader _sharedMemorySize = sharedMemorySize; } + /// + /// Reads data from the constant buffer 1. + /// + /// Offset in bytes to read from + /// Value at the given offset + public uint ConstantBuffer1Read(int offset) + { + if (Cb1DataSize < offset + 4) + { + Cb1DataSize = offset + 4; + } + + ulong baseAddress = _compute + ? _channel.BufferManager.GetComputeUniformBufferAddress(1) + : _channel.BufferManager.GetGraphicsUniformBufferAddress(_stageIndex, 1); + + return _channel.MemoryManager.Physical.Read(baseAddress + (ulong)offset); + } + /// /// Prints a log message. /// diff --git a/Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs b/Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs index a5712a14..754449fb 100644 --- a/Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs +++ b/Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs @@ -38,7 +38,7 @@ namespace Ryujinx.Graphics.Gpu.Shader /// /// Version of the codegen (to be changed when codegen or guest format change). /// - private const ulong ShaderCodeGenVersion = 2469; + private const ulong ShaderCodeGenVersion = 2530; // Progress reporting helpers private volatile int _shaderCount; @@ -112,7 +112,7 @@ namespace Ryujinx.Graphics.Gpu.Shader int programIndex = 0; List activeTasks = new List(); - AutoResetEvent taskDoneEvent = new AutoResetEvent(false); + using AutoResetEvent taskDoneEvent = new AutoResetEvent(false); // This thread dispatches tasks to do shader translation, and creates programs that OpenGL will link in the background. // The program link status is checked in a non-blocking manner so that multiple shaders can be compiled at once. @@ -191,7 +191,14 @@ namespace Ryujinx.Graphics.Gpu.Shader Task compileTask = Task.Run(() => { - IGpuAccessor gpuAccessor = new CachedGpuAccessor(_context, entry.Code, entry.Header.GpuAccessorHeader, entry.TextureDescriptors); + var binaryCode = new Memory(entry.Code); + + var gpuAccessor = new CachedGpuAccessor( + _context, + binaryCode, + binaryCode.Slice(binaryCode.Length - entry.Header.Cb1DataSize), + entry.Header.GpuAccessorHeader, + entry.TextureDescriptors); var options = new TranslationOptions(TargetLanguage.Glsl, TargetApi.OpenGL, DefaultFlags | TranslationFlags.Compute); program = Translator.CreateContext(0, gpuAccessor, options).Translate(out shaderProgramInfo); @@ -199,12 +206,20 @@ namespace Ryujinx.Graphics.Gpu.Shader task.OnTask(compileTask, (bool _, ShaderCompileTask task) => { + if (task.IsFaulted) + { + Logger.Warning?.Print(LogClass.Gpu, $"Host shader {key} is corrupted or incompatible, discarding..."); + + _cacheManager.RemoveProgram(ref key); + return true; // Exit early, the decoding step failed. + } + ShaderCodeHolder shader = new ShaderCodeHolder(program, shaderProgramInfo, entry.Code); Logger.Info?.Print(LogClass.Gpu, $"Host shader {key} got invalidated, rebuilding from guest..."); // Compile shader and create program as the shader program binary got invalidated. - shader.HostShader = _context.Renderer.CompileShader(ShaderStage.Compute, shader.Program.Code); + shader.HostShader = _context.Renderer.CompileShader(ShaderStage.Compute, program.Code); hostProgram = _context.Renderer.CreateProgram(new IShader[] { shader.HostShader }, null); task.OnCompiled(hostProgram, (bool isNewProgramValid, ShaderCompileTask task) => @@ -298,7 +313,14 @@ namespace Ryujinx.Graphics.Gpu.Shader } else { - IGpuAccessor gpuAccessor = new CachedGpuAccessor(_context, entry.Code, entry.Header.GpuAccessorHeader, entry.TextureDescriptors); + var binaryCode = new Memory(entry.Code); + + var gpuAccessor = new CachedGpuAccessor( + _context, + binaryCode, + binaryCode.Slice(binaryCode.Length - entry.Header.Cb1DataSize), + entry.Header.GpuAccessorHeader, + entry.TextureDescriptors); var options = new TranslationOptions(TargetLanguage.Glsl, TargetApi.OpenGL, flags); var options2 = new TranslationOptions(TargetLanguage.Glsl, TargetApi.OpenGL, flags | TranslationFlags.VertexA); @@ -310,7 +332,7 @@ namespace Ryujinx.Graphics.Gpu.Shader } // NOTE: Vertex B comes first in the shader cache. - byte[] code = entry.Code.AsSpan().Slice(0, entry.Header.Size).ToArray(); + byte[] code = entry.Code.AsSpan().Slice(0, entry.Header.Size - entry.Header.Cb1DataSize).ToArray(); byte[] code2 = entry.Code.AsSpan().Slice(entry.Header.Size, entry.Header.SizeA).ToArray(); shaders[i] = new ShaderCodeHolder(program, shaderProgramInfo, code, code2); @@ -326,13 +348,22 @@ namespace Ryujinx.Graphics.Gpu.Shader } else { - IGpuAccessor gpuAccessor = new CachedGpuAccessor(_context, entry.Code, entry.Header.GpuAccessorHeader, entry.TextureDescriptors); + var binaryCode = new Memory(entry.Code); + + var gpuAccessor = new CachedGpuAccessor( + _context, + binaryCode, + binaryCode.Slice(binaryCode.Length - entry.Header.Cb1DataSize), + entry.Header.GpuAccessorHeader, + entry.TextureDescriptors); var options = new TranslationOptions(TargetLanguage.Glsl, TargetApi.OpenGL, flags); program = Translator.CreateContext(0, gpuAccessor, options, counts).Translate(out shaderProgramInfo); } - shaders[i] = new ShaderCodeHolder(program, shaderProgramInfo, entry.Code); + byte[] code = entry.Code.AsSpan().Slice(0, entry.Header.Size - entry.Header.Cb1DataSize).ToArray(); + + shaders[i] = new ShaderCodeHolder(program, shaderProgramInfo, code); } shaderPrograms.Add(program); @@ -341,6 +372,14 @@ namespace Ryujinx.Graphics.Gpu.Shader task.OnTask(compileTask, (bool _, ShaderCompileTask task) => { + if (task.IsFaulted) + { + Logger.Warning?.Print(LogClass.Gpu, $"Host shader {key} is corrupted or incompatible, discarding..."); + + _cacheManager.RemoveProgram(ref key); + return true; // Exit early, the decoding step failed. + } + // If the host program was rejected by the gpu driver or isn't in cache, try to build from program sources again. if (!isHostProgramValid) { @@ -537,7 +576,7 @@ namespace Ryujinx.Graphics.Gpu.Shader isShaderCacheReadOnly = _cacheManager.IsReadOnly; // Compute hash and prepare data for shader disk cache comparison. - shaderCacheEntries = CacheHelper.CreateShaderCacheEntries(channel.MemoryManager, shaderContexts); + shaderCacheEntries = CacheHelper.CreateShaderCacheEntries(channel, shaderContexts); programCodeHash = CacheHelper.ComputeGuestHashFromCache(shaderCacheEntries); } @@ -659,7 +698,7 @@ namespace Ryujinx.Graphics.Gpu.Shader isShaderCacheReadOnly = _cacheManager.IsReadOnly; // Compute hash and prepare data for shader disk cache comparison. - shaderCacheEntries = CacheHelper.CreateShaderCacheEntries(channel.MemoryManager, shaderContexts); + shaderCacheEntries = CacheHelper.CreateShaderCacheEntries(channel, shaderContexts); programCodeHash = CacheHelper.ComputeGuestHashFromCache(shaderCacheEntries, tfd); } diff --git a/Ryujinx.Graphics.Gpu/Shader/ShaderCompileTask.cs b/Ryujinx.Graphics.Gpu/Shader/ShaderCompileTask.cs index ff48fab0..a9283de2 100644 --- a/Ryujinx.Graphics.Gpu/Shader/ShaderCompileTask.cs +++ b/Ryujinx.Graphics.Gpu/Shader/ShaderCompileTask.cs @@ -1,5 +1,4 @@ using Ryujinx.Graphics.GAL; -using System; using System.Threading; using System.Threading.Tasks; @@ -20,6 +19,8 @@ namespace Ryujinx.Graphics.Gpu.Shader private ShaderCompileTaskCallback _action; private AutoResetEvent _taskDoneEvent; + public bool IsFaulted => _programsTask.IsFaulted; + /// /// Create a new shader compile task, with an event to signal whenever a subtask completes. /// diff --git a/Ryujinx.Graphics.Shader/Decoders/Block.cs b/Ryujinx.Graphics.Shader/Decoders/Block.cs index e1470237..69cb55b9 100644 --- a/Ryujinx.Graphics.Shader/Decoders/Block.cs +++ b/Ryujinx.Graphics.Shader/Decoders/Block.cs @@ -8,10 +8,38 @@ namespace Ryujinx.Graphics.Shader.Decoders public ulong Address { get; set; } public ulong EndAddress { get; set; } - public Block Next { get; set; } - public Block Branch { get; set; } + private Block _next; + private Block _branch; - public OpCodeBranchIndir BrIndir { get; set; } + public Block Next + { + get + { + return _next; + } + set + { + _next?.Predecessors.Remove(this); + value?.Predecessors.Add(this); + _next = value; + } + } + + public Block Branch + { + get + { + return _branch; + } + set + { + _branch?.Predecessors.Remove(this); + value?.Predecessors.Add(this); + _branch = value; + } + } + + public HashSet Predecessors { get; } public List OpCodes { get; } public List PushOpCodes { get; } @@ -20,6 +48,8 @@ namespace Ryujinx.Graphics.Shader.Decoders { Address = address; + Predecessors = new HashSet(); + OpCodes = new List(); PushOpCodes = new List(); } diff --git a/Ryujinx.Graphics.Shader/Decoders/Decoder.cs b/Ryujinx.Graphics.Shader/Decoders/Decoder.cs index 9ca58177..c916935e 100644 --- a/Ryujinx.Graphics.Shader/Decoders/Decoder.cs +++ b/Ryujinx.Graphics.Shader/Decoders/Decoder.cs @@ -9,8 +9,6 @@ namespace Ryujinx.Graphics.Shader.Decoders { static class Decoder { - public const ulong ShaderEndDelimiter = 0xe2400fffff87000f; - public static Block[][] Decode(IGpuAccessor gpuAccessor, ulong startAddress, out bool hasBindless) { hasBindless = false; @@ -51,130 +49,139 @@ namespace Ryujinx.Graphics.Shader.Decoders GetBlock(funcAddress); - while (workQueue.TryDequeue(out Block currBlock)) + bool hasNewTarget; + + do { - // Check if the current block is inside another block. - if (BinarySearch(blocks, currBlock.Address, out int nBlkIndex)) + while (workQueue.TryDequeue(out Block currBlock)) { - Block nBlock = blocks[nBlkIndex]; - - if (nBlock.Address == currBlock.Address) + // Check if the current block is inside another block. + if (BinarySearch(blocks, currBlock.Address, out int nBlkIndex)) { - throw new InvalidOperationException("Found duplicate block address on the list."); - } + Block nBlock = blocks[nBlkIndex]; - nBlock.Split(currBlock); - blocks.Insert(nBlkIndex + 1, currBlock); - - continue; - } - - // If we have a block after the current one, set the limit address. - ulong limitAddress = ulong.MaxValue; - - if (nBlkIndex != blocks.Count) - { - Block nBlock = blocks[nBlkIndex]; - - int nextIndex = nBlkIndex + 1; - - if (nBlock.Address < currBlock.Address && nextIndex < blocks.Count) - { - limitAddress = blocks[nextIndex].Address; - } - else if (nBlock.Address > currBlock.Address) - { - limitAddress = blocks[nBlkIndex].Address; - } - } - - FillBlock(gpuAccessor, currBlock, limitAddress, startAddress, out bool blockHasBindless); - hasBindless |= blockHasBindless; - - if (currBlock.OpCodes.Count != 0) - { - // We should have blocks for all possible branch targets, - // including those from SSY/PBK instructions. - foreach (OpCodePush pushOp in currBlock.PushOpCodes) - { - GetBlock(pushOp.GetAbsoluteAddress()); - } - - // Set child blocks. "Branch" is the block the branch instruction - // points to (when taken), "Next" is the block at the next address, - // executed when the branch is not taken. For Unconditional Branches - // or end of program, Next is null. - OpCode lastOp = currBlock.GetLastOp(); - - if (lastOp is OpCodeBranch opBr) - { - if (lastOp.Emitter == InstEmit.Cal) + if (nBlock.Address == currBlock.Address) { - EnqueueFunction(opBr.GetAbsoluteAddress()); + throw new InvalidOperationException("Found duplicate block address on the list."); } - else + + nBlock.Split(currBlock); + blocks.Insert(nBlkIndex + 1, currBlock); + + continue; + } + + // If we have a block after the current one, set the limit address. + ulong limitAddress = ulong.MaxValue; + + if (nBlkIndex != blocks.Count) + { + Block nBlock = blocks[nBlkIndex]; + + int nextIndex = nBlkIndex + 1; + + if (nBlock.Address < currBlock.Address && nextIndex < blocks.Count) { - currBlock.Branch = GetBlock(opBr.GetAbsoluteAddress()); + limitAddress = blocks[nextIndex].Address; + } + else if (nBlock.Address > currBlock.Address) + { + limitAddress = blocks[nBlkIndex].Address; } } - else if (lastOp is OpCodeBranchIndir opBrIndir) + + FillBlock(gpuAccessor, currBlock, limitAddress, startAddress, out bool blockHasBindless); + hasBindless |= blockHasBindless; + + if (currBlock.OpCodes.Count != 0) { - // An indirect branch could go anywhere, we don't know the target. - // Those instructions are usually used on a switch to jump table - // compiler optimization, and in those cases the possible targets - // seems to be always right after the BRX itself. We can assume - // that the possible targets are all the blocks in-between the - // instruction right after the BRX, and the common target that - // all the "cases" should eventually jump to, acting as the - // switch break. - Block firstTarget = GetBlock(currBlock.EndAddress); + // We should have blocks for all possible branch targets, + // including those from SSY/PBK instructions. + foreach (OpCodePush pushOp in currBlock.PushOpCodes) + { + GetBlock(pushOp.GetAbsoluteAddress()); + } - firstTarget.BrIndir = opBrIndir; + // Set child blocks. "Branch" is the block the branch instruction + // points to (when taken), "Next" is the block at the next address, + // executed when the branch is not taken. For Unconditional Branches + // or end of program, Next is null. + OpCode lastOp = currBlock.GetLastOp(); - opBrIndir.PossibleTargets.Add(firstTarget); + if (lastOp is OpCodeBranch opBr) + { + if (lastOp.Emitter == InstEmit.Cal) + { + EnqueueFunction(opBr.GetAbsoluteAddress()); + } + else + { + currBlock.Branch = GetBlock(opBr.GetAbsoluteAddress()); + } + } + + if (!IsUnconditionalBranch(lastOp)) + { + currBlock.Next = GetBlock(currBlock.EndAddress); + } } - if (!IsUnconditionalBranch(lastOp)) + // Insert the new block on the list (sorted by address). + if (blocks.Count != 0) { - currBlock.Next = GetBlock(currBlock.EndAddress); + Block nBlock = blocks[nBlkIndex]; + + blocks.Insert(nBlkIndex + (nBlock.Address < currBlock.Address ? 1 : 0), currBlock); + } + else + { + blocks.Add(currBlock); } } - // Insert the new block on the list (sorted by address). - if (blocks.Count != 0) + // Propagate SSY/PBK addresses into their uses (SYNC/BRK). + foreach (Block block in blocks.Where(x => x.PushOpCodes.Count != 0)) { - Block nBlock = blocks[nBlkIndex]; - - blocks.Insert(nBlkIndex + (nBlock.Address < currBlock.Address ? 1 : 0), currBlock); - } - else - { - blocks.Add(currBlock); - } - - // Do we have a block after the current one? - if (currBlock.BrIndir != null && HasBlockAfter(gpuAccessor, currBlock, startAddress)) - { - bool targetVisited = visited.ContainsKey(currBlock.EndAddress); - - Block possibleTarget = GetBlock(currBlock.EndAddress); - - currBlock.BrIndir.PossibleTargets.Add(possibleTarget); - - if (!targetVisited) + for (int pushOpIndex = 0; pushOpIndex < block.PushOpCodes.Count; pushOpIndex++) { - possibleTarget.BrIndir = currBlock.BrIndir; + PropagatePushOp(visited, block, pushOpIndex); } } + + // Try to find target for BRX (indirect branch) instructions. + hasNewTarget = false; + + foreach (Block block in blocks) + { + if (block.GetLastOp() is OpCodeBranchIndir opBrIndir && opBrIndir.PossibleTargets.Count == 0) + { + ulong baseOffset = opBrIndir.Address + 8 + (ulong)opBrIndir.Offset; + + // An indirect branch could go anywhere, + // try to get the possible target offsets from the constant buffer. + (int cbBaseOffset, int cbOffsetsCount) = FindBrxTargetRange(block, opBrIndir.Ra.Index); + + if (cbOffsetsCount != 0) + { + hasNewTarget = true; + } + + for (int i = 0; i < cbOffsetsCount; i++) + { + uint targetOffset = gpuAccessor.ConstantBuffer1Read(cbBaseOffset + i * 4); + Block target = GetBlock(baseOffset + targetOffset); + opBrIndir.PossibleTargets.Add(target); + target.Predecessors.Add(block); + } + } + } + + // If we discovered new branch targets from the BRX instruction, + // we need another round of decoding to decode the new blocks. + // Additionally, we may have more SSY/PBK targets to propagate, + // and new BRX instructions. } - - foreach (Block block in blocks.Where(x => x.PushOpCodes.Count != 0)) - { - for (int pushOpIndex = 0; pushOpIndex < block.PushOpCodes.Count; pushOpIndex++) - { - PropagatePushOp(visited, block, pushOpIndex); - } - } + while (hasNewTarget); funcs.Add(blocks.ToArray()); } @@ -182,19 +189,6 @@ namespace Ryujinx.Graphics.Shader.Decoders return funcs.ToArray(); } - private static bool HasBlockAfter(IGpuAccessor gpuAccessor, Block currBlock, ulong startAdddress) - { - if (!gpuAccessor.MemoryMapped(startAdddress + currBlock.EndAddress) || - !gpuAccessor.MemoryMapped(startAdddress + currBlock.EndAddress + 7)) - { - return false; - } - - ulong inst = gpuAccessor.MemoryRead(startAdddress + currBlock.EndAddress); - - return inst != 0UL && inst != ShaderEndDelimiter; - } - private static bool BinarySearch(List blocks, ulong address, out int index) { index = 0; @@ -320,6 +314,115 @@ namespace Ryujinx.Graphics.Shader.Decoders opCode is OpCodeExit; } + private static (int, int) FindBrxTargetRange(Block block, int brxReg) + { + // Try to match the following pattern: + // + // IMNMX.U32 Rx, Rx, UpperBound, PT + // SHL Rx, Rx, 0x2 + // LDC Rx, c[0x1][Rx+BaseOffset] + // + // Here, Rx is an arbitrary register, "UpperBound" and "BaseOffset" are constants. + // The above pattern is assumed to be generated by the compiler before BRX, + // as the instruction is usually used to implement jump tables for switch statement optimizations. + // On a successful match, "BaseOffset" is the offset in bytes where the jump offsets are + // located on the constant buffer, and "UpperBound" is the total number of offsets for the BRX, minus 1. + + HashSet visited = new HashSet(); + + var ldcLocation = FindFirstRegWrite(visited, new BlockLocation(block, block.OpCodes.Count - 1), brxReg); + if (ldcLocation.Block == null || ldcLocation.Block.OpCodes[ldcLocation.Index] is not OpCodeLdc opLdc) + { + return (0, 0); + } + + if (opLdc.Slot != 1 || opLdc.IndexMode != CbIndexMode.Default) + { + return (0, 0); + } + + var shlLocation = FindFirstRegWrite(visited, ldcLocation, opLdc.Ra.Index); + if (shlLocation.Block == null || shlLocation.Block.OpCodes[shlLocation.Index] is not OpCodeAluImm opShl) + { + return (0, 0); + } + + if (opShl.Emitter != InstEmit.Shl || opShl.Immediate != 2) + { + return (0, 0); + } + + var imnmxLocation = FindFirstRegWrite(visited, shlLocation, opShl.Ra.Index); + if (imnmxLocation.Block == null || imnmxLocation.Block.OpCodes[imnmxLocation.Index] is not OpCodeAluImm opImnmx) + { + return (0, 0); + } + + bool isImnmxS32 = opImnmx.RawOpCode.Extract(48); + + if (opImnmx.Emitter != InstEmit.Imnmx || isImnmxS32 || !opImnmx.Predicate39.IsPT || opImnmx.InvertP) + { + return (0, 0); + } + + return (opLdc.Offset, opImnmx.Immediate + 1); + } + + private struct BlockLocation + { + public Block Block { get; } + public int Index { get; } + + public BlockLocation(Block block, int index) + { + Block = block; + Index = index; + } + } + + private static BlockLocation FindFirstRegWrite(HashSet visited, BlockLocation location, int regIndex) + { + Queue toVisit = new Queue(); + toVisit.Enqueue(location); + visited.Add(location.Block); + + while (toVisit.TryDequeue(out var currentLocation)) + { + Block block = currentLocation.Block; + for (int i = currentLocation.Index - 1; i >= 0; i--) + { + if (WritesToRegister(block.OpCodes[i], regIndex)) + { + return new BlockLocation(block, i); + } + } + + foreach (Block predecessor in block.Predecessors) + { + if (visited.Add(predecessor)) + { + toVisit.Enqueue(new BlockLocation(predecessor, predecessor.OpCodes.Count)); + } + } + } + + return new BlockLocation(null, 0); + } + + private static bool WritesToRegister(OpCode opCode, int regIndex) + { + // Predicate instruction only ever writes to predicate, so we shouldn't check those. + if (opCode.Emitter == InstEmit.Fsetp || + opCode.Emitter == InstEmit.Hsetp2 || + opCode.Emitter == InstEmit.Isetp || + opCode.Emitter == InstEmit.R2p) + { + return false; + } + + return opCode is IOpCodeRd opRd && opRd.Rd.Index == regIndex; + } + private enum MergeType { Brk = 0, @@ -388,6 +491,8 @@ namespace Ryujinx.Graphics.Shader.Decoders { OpCodePush pushOp = currBlock.PushOpCodes[pushOpIndex]; + Block target = blocks[pushOp.GetAbsoluteAddress()]; + Stack workQueue = new Stack(); HashSet visited = new HashSet(); @@ -497,10 +602,12 @@ namespace Ryujinx.Graphics.Shader.Decoders if (branchStack.Count == 0) { // If the entire stack was consumed, then the current pop instruction - // just consumed the address from out push instruction. - op.Targets.Add(pushOp, op.Targets.Count); - - pushOp.PopOps.TryAdd(op, Local()); + // just consumed the address from our push instruction. + if (op.Targets.TryAdd(pushOp, op.Targets.Count)) + { + pushOp.PopOps.Add(op, Local()); + target.Predecessors.Add(current); + } } else { diff --git a/Ryujinx.Graphics.Shader/IGpuAccessor.cs b/Ryujinx.Graphics.Shader/IGpuAccessor.cs index 26a8cafd..04f23061 100644 --- a/Ryujinx.Graphics.Shader/IGpuAccessor.cs +++ b/Ryujinx.Graphics.Shader/IGpuAccessor.cs @@ -7,6 +7,11 @@ // No default log output. } + uint ConstantBuffer1Read(int offset) + { + return 0; + } + T MemoryRead(ulong address) where T : unmanaged; bool MemoryMapped(ulong address) diff --git a/Ryujinx.Graphics.Shader/Instructions/InstEmitFlow.cs b/Ryujinx.Graphics.Shader/Instructions/InstEmitFlow.cs index d4ab5955..1f5bf35b 100644 --- a/Ryujinx.Graphics.Shader/Instructions/InstEmitFlow.cs +++ b/Ryujinx.Graphics.Shader/Instructions/InstEmitFlow.cs @@ -25,6 +25,12 @@ namespace Ryujinx.Graphics.Shader.Instructions { OpCodeBranchIndir op = (OpCodeBranchIndir)context.CurrOp; + if (op.PossibleTargets.Count == 0) + { + context.Config.GpuAccessor.Log($"Failed to find targets for BRX instruction at 0x{op.Address:X}."); + return; + } + int offset = (int)op.Address + 8 + op.Offset; Operand address = context.IAdd(Register(op.Ra), Const(offset));