From 93cd327873bed829c3a0aad938cb8d2cca2ff806 Mon Sep 17 00:00:00 2001 From: riperiperi Date: Sat, 2 Sep 2023 21:58:15 +0100 Subject: [PATCH] Vulkan: Device Local and higher invocation count for buffer conversions (#5623) Just some simple changes to the buffer conversion shaders (stride conversion, D32S8 to D24S8). The first change is using a device-local buffer for converted vertex buffers, since they're only read/written on the GPU. These paths don't trigger on NVIDIA, but if you force NVIDIA GPUs to use them, it demonstrates the full extent to which writing to host-owned memory from compute absolutely destroys performance on them. AMD GPUs are less heavily affected by this issue, but since the game in question was writing 230MB from compute, I imagine it should have some effect. The second change is allowing the buffer conversion shaders to scale their work group count. While dividing the work between 32 invocations works OK for M1 Macs, it's not so great for anything with more cores like AMD GPUs, which should be able to do a lot more parallel copies. Now, it scales by roughly 100 elements per invocation. Some stride change cases could be improved further by either limiting vertex buffer size somehow (reading the index buffer could help, but is always risky) or only updating regions that changed, rather than invalidating the whole thing. 
--- src/Ryujinx.Graphics.Vulkan/BufferHolder.cs | 6 +++--- src/Ryujinx.Graphics.Vulkan/HelperShader.cs | 6 +++--- .../ChangeBufferStrideShaderSource.comp | 4 ++-- .../ConvertD32S8ToD24S8ShaderSource.comp | 4 ++-- .../SpirvBinaries/ChangeBufferStride.spv | Bin 3812 -> 3932 bytes .../SpirvBinaries/ConvertD32S8ToD24S8.spv | Bin 3236 -> 3356 bytes 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Ryujinx.Graphics.Vulkan/BufferHolder.cs b/src/Ryujinx.Graphics.Vulkan/BufferHolder.cs index c767a57a..a93ced0e 100644 --- a/src/Ryujinx.Graphics.Vulkan/BufferHolder.cs +++ b/src/Ryujinx.Graphics.Vulkan/BufferHolder.cs @@ -967,7 +967,7 @@ namespace Ryujinx.Graphics.Vulkan if (!_cachedConvertedBuffers.TryGetValue(offset, size, key, out var holder)) { - holder = _gd.BufferManager.Create(_gd, (size * 2 + 3) & ~3); + holder = _gd.BufferManager.Create(_gd, (size * 2 + 3) & ~3, baseType: BufferAllocationType.DeviceLocal); _gd.PipelineInternal.EndRenderPass(); _gd.HelperShader.ConvertI8ToI16(_gd, cbs, this, holder, offset, size); @@ -993,7 +993,7 @@ namespace Ryujinx.Graphics.Vulkan { int alignedStride = (stride + (alignment - 1)) & -alignment; - holder = _gd.BufferManager.Create(_gd, (size / stride) * alignedStride); + holder = _gd.BufferManager.Create(_gd, (size / stride) * alignedStride, baseType: BufferAllocationType.DeviceLocal); _gd.PipelineInternal.EndRenderPass(); _gd.HelperShader.ChangeStride(_gd, cbs, this, holder, offset, size, stride, alignedStride); @@ -1023,7 +1023,7 @@ namespace Ryujinx.Graphics.Vulkan int convertedCount = pattern.GetConvertedCount(indexCount); - holder = _gd.BufferManager.Create(_gd, convertedCount * 4); + holder = _gd.BufferManager.Create(_gd, convertedCount * 4, baseType: BufferAllocationType.DeviceLocal); _gd.PipelineInternal.EndRenderPass(); _gd.HelperShader.ConvertIndexBuffer(_gd, cbs, this, holder, pattern, indexSize, offset, indexCount); diff --git a/src/Ryujinx.Graphics.Vulkan/HelperShader.cs 
b/src/Ryujinx.Graphics.Vulkan/HelperShader.cs index a6d23764..5be4a932 100644 --- a/src/Ryujinx.Graphics.Vulkan/HelperShader.cs +++ b/src/Ryujinx.Graphics.Vulkan/HelperShader.cs @@ -5,7 +5,6 @@ using Ryujinx.Graphics.Shader.Translation; using Silk.NET.Vulkan; using System; using System.Collections.Generic; -using System.IO; using System.Numerics; using CompareOp = Ryujinx.Graphics.GAL.CompareOp; using Format = Ryujinx.Graphics.GAL.Format; @@ -27,6 +26,7 @@ namespace Ryujinx.Graphics.Vulkan class HelperShader : IDisposable { private const int UniformBufferAlignment = 256; + private const int ConvertElementsPerWorkgroup = 32 * 100; // Work group size of 32 times 100 elements. private const string ShaderBinariesPath = "Ryujinx.Graphics.Vulkan/Shaders/SpirvBinaries"; private readonly PipelineHelperShader _pipeline; @@ -894,7 +894,7 @@ namespace Ryujinx.Graphics.Vulkan _pipeline.SetStorageBuffers(1, sbRanges); _pipeline.SetProgram(_programStrideChange); - _pipeline.DispatchCompute(1, 1, 1); + _pipeline.DispatchCompute(1 + elems / ConvertElementsPerWorkgroup, 1, 1); gd.BufferManager.Delete(bufferHandle); @@ -1742,7 +1742,7 @@ namespace Ryujinx.Graphics.Vulkan _pipeline.SetStorageBuffers(1, sbRanges); _pipeline.SetProgram(_programConvertD32S8ToD24S8); - _pipeline.DispatchCompute(1, 1, 1); + _pipeline.DispatchCompute(1 + inSize / ConvertElementsPerWorkgroup, 1, 1); gd.BufferManager.Delete(bufferHandle); diff --git a/src/Ryujinx.Graphics.Vulkan/Shaders/ChangeBufferStrideShaderSource.comp b/src/Ryujinx.Graphics.Vulkan/Shaders/ChangeBufferStrideShaderSource.comp index 081fc119..4deba30d 100644 --- a/src/Ryujinx.Graphics.Vulkan/Shaders/ChangeBufferStrideShaderSource.comp +++ b/src/Ryujinx.Graphics.Vulkan/Shaders/ChangeBufferStrideShaderSource.comp @@ -29,7 +29,7 @@ void main() int sourceOffset = stride_arguments_data.w; int strideRemainder = targetStride - sourceStride; - int invocations = int(gl_WorkGroupSize.x); + int invocations = int(gl_WorkGroupSize.x * 
gl_NumWorkGroups.x); int copiesRequired = bufferSize / sourceStride; @@ -39,7 +39,7 @@ void main() int allInvocationCopies = copiesRequired / invocations; // - Extra remainder copy that this invocation performs. - int index = int(gl_LocalInvocationID.x); + int index = int(gl_GlobalInvocationID.x); int extra = (index < (copiesRequired % invocations)) ? 1 : 0; int copyCount = allInvocationCopies + extra; diff --git a/src/Ryujinx.Graphics.Vulkan/Shaders/ConvertD32S8ToD24S8ShaderSource.comp b/src/Ryujinx.Graphics.Vulkan/Shaders/ConvertD32S8ToD24S8ShaderSource.comp index d3a74b1c..96cbdebb 100644 --- a/src/Ryujinx.Graphics.Vulkan/Shaders/ConvertD32S8ToD24S8ShaderSource.comp +++ b/src/Ryujinx.Graphics.Vulkan/Shaders/ConvertD32S8ToD24S8ShaderSource.comp @@ -23,7 +23,7 @@ layout (std430, set = 1, binding = 2) buffer out_s void main() { // Determine what slice of the stride copies this invocation will perform. - int invocations = int(gl_WorkGroupSize.x); + int invocations = int(gl_WorkGroupSize.x * gl_NumWorkGroups.x); int copiesRequired = pixelCount; @@ -33,7 +33,7 @@ void main() int allInvocationCopies = copiesRequired / invocations; // - Extra remainder copy that this invocation performs. - int index = int(gl_LocalInvocationID.x); + int index = int(gl_GlobalInvocationID.x); int extra = (index < (copiesRequired % invocations)) ? 1 : 0; int copyCount = allInvocationCopies + extra; diff --git a/src/Ryujinx.Graphics.Vulkan/Shaders/SpirvBinaries/ChangeBufferStride.spv b/src/Ryujinx.Graphics.Vulkan/Shaders/SpirvBinaries/ChangeBufferStride.spv index 49e7d23f9a9c463775404cee485a872fd0d636bb..1e0d8810fbc2a6513b0edb32ce596c3aa68bb6ce 100644 GIT binary patch literal 3932 zcmZ9MX>(Ln5QZ-#ldvxW;syjzL0JR^L=j93IG~XTL4%?WlguOxBr{vUtXW! 
z>r3t8$k@pKR=cwQ(BT6{u1R{*1Y_4GE0O}b7nv>B>N-{-yO1Nub>t?pj#|c0HW@_B zReJa>MgNJ>n=DI;Bc)RpFO*tW%9U!fbabNDF16Z?W_h}*e}(?y=xA}|RB3W<4%hrl zwKP+!S4)*zvpU(X-ALooI$2z;J~`8=tGbdAjZ}KX-daK=BoJAoS#%9g=;YO$KhU~kDZZVWL>&$1|Il1QK!;Kp^ z4p(Wlb6ihy39gpr^dz*OTupIr(m*#lTpV;CKW#s`>}aUilip2v_trtz>S+oYh1Avu zm!=r+J@ykXpk41YmUqO@Gw>_om-Aj1sqG!uf%t{JebG#9-^xnlvCQvWwV%ptA5|a1 z6G_*L{p8n^i}ieSu4laKX}ezdoqfKZd-u!NlZ$xwov){Dy{P{y*nS^JHjriBc4p9n z7H^^l*Be008NlvGEApGMmuI*ay)~PkoUzunjx#-$+0ImM54JT{BYv*620fs`r(b(* zYAx1V2R7Hd@NWd$TiBjO)HmO=a6jRH8a#;D`!i@`^xN;VXlu#aueN6_r+;^b%?ZC} zu74w|zK(Vs`y0x5_caUlf68wQtJTqR&dc-Hw*DO2`N&yc+dCj~82VAl?NJJ|m3atqwo4rBn4H{bJ=8^XUE{T#9jiCOqgp3nGu#Axq9ym#_< zvU%;zcx!qmw4;apVD}dG3t)S?pY?PAY>d45-XXb&e-SJf_Df*bi2OrfW8}^E&dJ4| zUIrgQ@_RapEg${A0+uu1JE|S)9RtgS{Tevm&v9(~d64br1lSn)Kj8dN8APn3?{`kZ zvv>m;M1~OWpKJRb;NvXxi2sQ(yB|{D;(p9?e{#mzzk6{_b*%X#cp;U= w|C67;M1& literal 3812 zcmZ9NX>(Ln5QZ-#ldvxW;syk8Ll8v)Q3S(gKu4nqBCf+Qa}!2~nJ{w)37{aNsHp$L zkNyRJl%-Z#<@4O!rf%h?y87+6m(!=unUkTVJ4chsl4M1)I{7VG_|_*&lO;g)h4o3Y zG8xJ3iRyH9Z$E49-GAUkBiAHDd4jQPljTVTJ&atgw>mmjAqKXQ9v{yTHv6a>OS+`f8P4%zPKQTEuQ9V^_%+2ANZ>P0( ztCQB6tzO#5T37S9yiO6<>NMJeW;vijrE;$odRs;h{qCUGNT;)2t4WoFJ&m#=e9qMB zz1hL#w3GGqjj(5ZevAFJW<9IJSJ<0#@N6HQVXon7A=n7Nu3}@T5Duqi4XHG)r@dwc9wXCFW8 zbO#-qwzuQxJ{!+iXTIR7a=pfb%^5cZ*X(B{ZYVhi*UEE-k{bGp91rKZJK%C)kMJk& zCs&N0ibMJRlXq`(=vF6BA)}Jp`rz^uLcMihbIl9?MzFnw{RlYfoBssjeqy~R!Q+U%??oG<-+o^} zTT9-4wLMcg{rd`R&PG;w4IOn~FMRIf64MRs+OJ+;X89V+Mh z7L{$^3hiUe(7WjWIcM%2UW*)IPTtLj5Pe6PqBGNuoK4`!aqf}xFj${C&Q&{d{C!4_ z_d9YP1M4%#*=tA67I5VF8;G2(V14FzKHA|M1G{$E+rjpKhg;yjb|7Pjy!oD|oY=GU zw-vchgY7BoXTa9FSJZkIY>d45p1oYeKL?fzdpFoMB7YCq70>< zC$HB(jF4j5*Hf9_#-+M0?@h8CL{FB)3hv`8L)L%@vCreZy`$%&(wJR(evA2*EPnT{cXxct#`oQ$FSc88x!^< zco`CVcn@rhe5~<4*gE=Lql%V`8q?r%&9m6v_o(>+*cf^9eIMk+=g_N>e|Qr=MEf?X z|Kcuvg#H-u)Bg!tzdF|b6l{&KKLdMiaaTSE8zcWOHMP$p*0z?|K75m+j|+uQZ66oW z4aCnrnrL&?v2F^sM%Xi8=Nf&?f{l^45A8Z)ZEJ}m_cGWS#r!*9Iq!DNPcGv7V9zAt zGq7C5>yz^h8=$WuJ;ePQXPryvuMm9^cMaS|jB|b0mNU*dI1_tUM=$eW=McSI2g^k- 
z`sB>NfHvP5So;Rrwe;OAd}`0&8}u#2PycPSx$0Q^Td+04z60Kd#2I`CHb&kv(Eb{+ zwzb5Odly_jgYU8BVjueCt`xKT0qmO?v%3e5*_r45_w7F2ea`8f6#6FyT&d3uy94ff*IS2Ozw47Z z?8>RGEi5lg#YuDO{DrebjJiS^NX$VuOVXoe2x9f4T8~XJy zi2Qmp==hB!*!G{~$>M^j5q6vTfRZxh`z`4NgX$rOwp$G^30hGsW*|MsMp$3$ZmqUD zo3&1>+m00`U)haGH(Kq$k8k_;yQce5`h`8bO@Aki9Zc_D zz1PClnwQ%#Kb!iC?qz9zC+T=8F1aaboo6>|OF2vKoVb{VRA)Dfy8_o~Sc_>~Rh)Xy zaRoOgE|AdME&P5vxtHPF_@a7tFC_agS)$qjk!jV6Ry#_>LI%uP)>h+}Ta!5f{D zundp)V2ZJ+!TaI=Sh_4B7y0e{R+Id2mVZSy`RRfEU~(Gfp7Z(a-1&TPmXGu2^MT1{ z{_kYd@3WF4V&TWtQ$ZU0sCuAYMM4i1;Q?s`9{*D*um7~pWOzt+RdQUI{KSxx{Ug%k zg1?+$-UiP0kIH5de^449IZg`m9@OsjRdmN>_j>K+1M^OBHwmm7VYBHDUVsCF|#P*~Gv<5_ekq zjD$S+epT1>{e}d7Qo>zOoA-yeT5rncJciH8Cgwp_>n+*Dz_XuufwTCxW#{*w7p_Qn zJ9@YvAqL*|T$G(Z-#fzj^IZ}q2Hy6(E4v~Y(KM0wJxQO0dmtXa^?X@2b%~*8W*W|N zU6l=IJzh_J#P;H^NW3_LwlrQxvo&F~(7>$m-Z zG{155pPJPh((@7){tu<`qpkK$+2k<1Ae(owS@=jcG4TH=C%7gdH@UEF?_^4M|h-ljaovhJ=`P$yRozdl^rDPd3=j zuqC@Hv3rYT!`T_|!C9{@+2r_5^|-J561cX+Vmh)p#~)ctA{#E2{GM@L+3dw<_p>cK rl<+3>K)-OrGDFN0?~bfe)KO`TK*$Tt+L|#%uc6{8LH`i-|o}h=bYXHrQy>ft}^8Ix&7{2H~0>_VK*f6 z30KW^qq*FiO0)LV#Y?Xdaln=GKw=KMF;|f;OE$u|gJZws8Ofq#O>#(i@E?}cC9FeA z`ZFy3t3cU}x<)gYyK_58TVWFRf_q7~wH_qlcDJA5--o}kxY%gU1+C4^AkDUuC`jT? z6tv@B)XL(mXg5z07k64ozwHN9l*#Y4qV7l4L)_Www!$p#c2Y4_RdcS@-HfAjDSFUX z{(~chIv7`C?T1qNeTFYLXw$ z^7FeOKRu8iOn$=}wg2U_efs&}EFb&z^MT1{{w>+``?TbUSolfxRFcL%rXJ|OCZUI# z@Q5@5f3EP4$*xN#gvp0*u&(UJWs?j3n*3nS5zf{hkj+Q_gVOj|PyG`)ufO!25PmAx zyb*(VdZ#yd@`JetIP$Ze``{!0bJFeB!e{s`VR|{PYV*>yT*~f!Y0d!d zq-{7~vZk)E9yTl--LwvoF&-mP0EHdd`h<84{r?pJTCfqK{he) zzlh^^^pb=;_5~{)~jPpmt3f-fF!foBbF*C%YzjRMa{zn;3Z3bI))VKPB5= z|Ee&1`Jq_Pe@=*jw>7WJ_V@ROu)n`Ig^7W;H5X-*cU-s5-Y!XoB#W z<8S>Yr=|IAqW{#a%t+5l`0&q3<40Sq_hi>S+>p(^*eu+VO$_`$$_ai~LT++lTdt2} zvtR4`W7%-*)y@KLTBp7wy)5A!rGMhc!@S>?;A7tBr9UY+a`0OQTg^MN{hFT%!}&Gg ztmfykttN3+^9$MdsIejq#z+0HB-9<(WP$HWm>1?4A8&gg`B8I<|DNQoB$R9vduqvs zCqM5E*!FN=Hg{_07R!dSJ>Y}0UcZq|j^9*|^GYOe>k^CEkj*~+C}O&@;X0Dv3vN?3 yYw_9nJdnL6;ZEp*e&L8^hL|Vr9c}&hWY;Cue<~Z!`o{<7&*+TszdFUMlK%jz)Cy_<