ryujinx-final/Ryujinx.Graphics/Graphics3d/NvGpuEngine3d.cs

using Ryujinx.Common;
using Ryujinx.Graphics.Gal;
using Ryujinx.Graphics.Memory;
using Ryujinx.Graphics.Shader;
using Ryujinx.Graphics.Texture;
using System;
using System.Collections.Generic;

namespace Ryujinx.Graphics.Graphics3d
{
    class NvGpuEngine3d : INvGpuEngine
    {
        public int[] Registers { get; private set; }

        private NvGpu _gpu;

        private Dictionary<int, NvGpuMethod> _methods;

        private struct ConstBuffer
        {
            public bool Enabled;
            public long Position;
            public int  Size;
        }

        private ConstBuffer[][] _constBuffers;

        // Viewport dimensions kept for scissor test limits
        private int _viewportX0 = 0;
        private int _viewportY0 = 0;
        private int _viewportX1 = 0;
        private int _viewportY1 = 0;
        private int _viewportWidth = 0;
        private int _viewportHeight = 0;

        private int _currentInstance = 0;

        public NvGpuEngine3d(NvGpu gpu)
        {
            _gpu = gpu;

            Registers = new int[0xe00];

            _methods = new Dictionary<int, NvGpuMethod>();

            void AddMethod(int meth, int count, int stride, NvGpuMethod method)
            {
                while (count-- > 0)
                {
                    _methods.Add(meth, method);

                    meth += stride;
                }
            }

            AddMethod(0x585,  1, 1, VertexEndGl);
            AddMethod(0x674,  1, 1, ClearBuffers);
            AddMethod(0x6c3,  1, 1, QueryControl);
            AddMethod(0x8e4, 16, 1, CbData);
            AddMethod(0x904,  5, 8, CbBind);

            _constBuffers = new ConstBuffer[6][];

            for (int index = 0; index < _constBuffers.Length; index++)
            {
                _constBuffers[index] = new ConstBuffer[18];
            }

            //Ensure that all components are enabled by default.
            //FIXME: Is this correct?
            WriteRegister(NvGpuEngine3dReg.ColorMaskN, 0x1111);

            WriteRegister(NvGpuEngine3dReg.FrameBufferSrgb, 1);

            WriteRegister(NvGpuEngine3dReg.FrontFace, (int)GalFrontFace.Cw);

            for (int index = 0; index < GalPipelineState.RenderTargetsCount; index++)
            {
                WriteRegister(NvGpuEngine3dReg.IBlendNEquationRgb   + index * 8, (int)GalBlendEquation.FuncAdd);
                WriteRegister(NvGpuEngine3dReg.IBlendNFuncSrcRgb    + index * 8, (int)GalBlendFactor.One);
                WriteRegister(NvGpuEngine3dReg.IBlendNFuncDstRgb    + index * 8, (int)GalBlendFactor.Zero);
                WriteRegister(NvGpuEngine3dReg.IBlendNEquationAlpha + index * 8, (int)GalBlendEquation.FuncAdd);
                WriteRegister(NvGpuEngine3dReg.IBlendNFuncSrcAlpha  + index * 8, (int)GalBlendFactor.One);
                WriteRegister(NvGpuEngine3dReg.IBlendNFuncDstAlpha  + index * 8, (int)GalBlendFactor.Zero);
            }
        }

        public void CallMethod(NvGpuVmm vmm, GpuMethodCall methCall)
        {
            if (_methods.TryGetValue(methCall.Method, out NvGpuMethod method))
            {
                method(vmm, methCall);
            }
            else
            {
                WriteRegister(methCall);
            }
        }

        private void VertexEndGl(NvGpuVmm vmm, GpuMethodCall methCall)
        {
            LockCaches();

            GalPipelineState state = new GalPipelineState();

            // Framebuffer must be run configured because viewport dimensions may be used in other methods
            SetFrameBuffer(state);

            for (int fbIndex = 0; fbIndex < 8; fbIndex++)
            {
                SetFrameBuffer(vmm, fbIndex);
            }

            SetFrontFace(state);
            SetCullFace(state);
            SetDepth(state);
            SetStencil(state);
            SetScissor(state);
            SetBlending(state);
            SetColorMask(state);
            SetPrimitiveRestart(state);

            SetZeta(vmm);

            SetRenderTargets();

            long[] keys = UploadShaders(vmm);

            _gpu.Renderer.Shader.BindProgram();

            UploadTextures(vmm, state, keys);
            UploadConstBuffers(vmm, state, keys);
            UploadVertexArrays(vmm, state);

            DispatchRender(vmm, state);

            UnlockCaches();
        }

        private void LockCaches()
        {
            _gpu.Renderer.Buffer.LockCache();
            _gpu.Renderer.Rasterizer.LockCaches();
            _gpu.Renderer.Texture.LockCache();
        }

        private void UnlockCaches()
        {
            _gpu.Renderer.Buffer.UnlockCache();
            _gpu.Renderer.Rasterizer.UnlockCaches();
            _gpu.Renderer.Texture.UnlockCache();
        }

        private void ClearBuffers(NvGpuVmm vmm, GpuMethodCall methCall)
        {
            int attachment = (methCall.Argument >> 6) & 0xf;

            GalClearBufferFlags flags = (GalClearBufferFlags)(methCall.Argument & 0x3f);

            float red   = ReadRegisterFloat(NvGpuEngine3dReg.ClearNColor + 0);
            float green = ReadRegisterFloat(NvGpuEngine3dReg.ClearNColor + 1);
            float blue  = ReadRegisterFloat(NvGpuEngine3dReg.ClearNColor + 2);
            float alpha = ReadRegisterFloat(NvGpuEngine3dReg.ClearNColor + 3);

            float depth = ReadRegisterFloat(NvGpuEngine3dReg.ClearDepth);

            int stencil = ReadRegister(NvGpuEngine3dReg.ClearStencil);

            SetFrameBuffer(vmm, attachment);

            SetZeta(vmm);

            SetRenderTargets();

            _gpu.Renderer.RenderTarget.Bind();

            _gpu.Renderer.Rasterizer.ClearBuffers(flags, attachment, red, green, blue, alpha, depth, stencil);

            _gpu.Renderer.Pipeline.ResetDepthMask();
            _gpu.Renderer.Pipeline.ResetColorMask(attachment);
        }

        private void SetFrameBuffer(NvGpuVmm vmm, int fbIndex)
        {
            long va = MakeInt64From2xInt32(NvGpuEngine3dReg.FrameBufferNAddress + fbIndex * 0x10);

            int surfFormat = ReadRegister(NvGpuEngine3dReg.FrameBufferNFormat + fbIndex * 0x10);

            if (va == 0 || surfFormat == 0)
            {
                _gpu.Renderer.RenderTarget.UnbindColor(fbIndex);

                return;
            }

            long key = vmm.GetPhysicalAddress(va);

            int width  = ReadRegister(NvGpuEngine3dReg.FrameBufferNWidth  + fbIndex * 0x10);
            int height = ReadRegister(NvGpuEngine3dReg.FrameBufferNHeight + fbIndex * 0x10);

            int arrayMode   = ReadRegister(NvGpuEngine3dReg.FrameBufferNArrayMode + fbIndex * 0x10);
            int layerCount  = arrayMode & 0xFFFF;
            int layerStride = ReadRegister(NvGpuEngine3dReg.FrameBufferNLayerStride + fbIndex * 0x10);
            int baseLayer   = ReadRegister(NvGpuEngine3dReg.FrameBufferNBaseLayer + fbIndex * 0x10);
            int blockDim    = ReadRegister(NvGpuEngine3dReg.FrameBufferNBlockDim + fbIndex * 0x10);

            int gobBlockHeight = 1 << ((blockDim >> 4) & 7);

            GalMemoryLayout layout = (GalMemoryLayout)((blockDim >> 12) & 1);

            float tx = ReadRegisterFloat(NvGpuEngine3dReg.ViewportNTranslateX + fbIndex * 8);
            float ty = ReadRegisterFloat(NvGpuEngine3dReg.ViewportNTranslateY + fbIndex * 8);

            float sx = ReadRegisterFloat(NvGpuEngine3dReg.ViewportNScaleX + fbIndex * 8);
            float sy = ReadRegisterFloat(NvGpuEngine3dReg.ViewportNScaleY + fbIndex * 8);

            _viewportX0 = (int)MathF.Max(0, tx - MathF.Abs(sx));
            _viewportY0 = (int)MathF.Max(0, ty - MathF.Abs(sy));

            _viewportX1 = (int)(tx + MathF.Abs(sx));
            _viewportY1 = (int)(ty + MathF.Abs(sy));

            GalImageFormat format = ImageUtils.ConvertSurface((GalSurfaceFormat)surfFormat);

            GalImage image = new GalImage(width, height, 1, 1, 1, gobBlockHeight, 1, layout, format, GalTextureTarget.TwoD);

            _gpu.ResourceManager.SendColorBuffer(vmm, key, fbIndex, image);

            _gpu.Renderer.RenderTarget.SetViewport(fbIndex, _viewportX0, _viewportY0, _viewportX1 - _viewportX0, _viewportY1 - _viewportY0);
        }

        private void SetFrameBuffer(GalPipelineState state)
        {
            state.FramebufferSrgb = ReadRegisterBool(NvGpuEngine3dReg.FrameBufferSrgb);

            state.FlipX = GetFlipSign(NvGpuEngine3dReg.ViewportNScaleX);
            state.FlipY = GetFlipSign(NvGpuEngine3dReg.ViewportNScaleY);

            int screenYControl = ReadRegister(NvGpuEngine3dReg.ScreenYControl);

            bool negateY = (screenYControl & 1) != 0;

            if (negateY)
            {
                state.FlipY = -state.FlipY;
            }
        }

        private void SetZeta(NvGpuVmm vmm)
        {
            long va = MakeInt64From2xInt32(NvGpuEngine3dReg.ZetaAddress);

            int zetaFormat = ReadRegister(NvGpuEngine3dReg.ZetaFormat);

            int blockDim = ReadRegister(NvGpuEngine3dReg.ZetaBlockDimensions);

            int gobBlockHeight = 1 << ((blockDim >> 4) & 7);

            GalMemoryLayout layout = (GalMemoryLayout)((blockDim >> 12) & 1); //?

            bool zetaEnable = ReadRegisterBool(NvGpuEngine3dReg.ZetaEnable);

            if (va == 0 || zetaFormat == 0 || !zetaEnable)
            {
                _gpu.Renderer.RenderTarget.UnbindZeta();

                return;
            }

            long key = vmm.GetPhysicalAddress(va);

            int width  = ReadRegister(NvGpuEngine3dReg.ZetaHoriz);
            int height = ReadRegister(NvGpuEngine3dReg.ZetaVert);

            GalImageFormat format = ImageUtils.ConvertZeta((GalZetaFormat)zetaFormat);

            // TODO: Support non 2D?
            GalImage image = new GalImage(width, height, 1, 1, 1, gobBlockHeight, 1, layout, format, GalTextureTarget.TwoD);

            _gpu.ResourceManager.SendZetaBuffer(vmm, key, image);
        }

        private long[] UploadShaders(NvGpuVmm vmm)
        {
            long[] keys = new long[5];

            long basePosition = MakeInt64From2xInt32(NvGpuEngine3dReg.ShaderAddress);

            int index = 1;

            int vpAControl = ReadRegister(NvGpuEngine3dReg.ShaderNControl);

            bool vpAEnable = (vpAControl & 1) != 0;

            if (vpAEnable)
            {
                //Note: The maxwell supports 2 vertex programs, usually
                //only VP B is used, but in some cases VP A is also used.
                //In this case, it seems to function as an extra vertex
                //shader stage.
                //The graphics abstraction layer has a special overload for this
                //case, which should merge the two shaders into one vertex shader.
                int vpAOffset = ReadRegister(NvGpuEngine3dReg.ShaderNOffset);
                int vpBOffset = ReadRegister(NvGpuEngine3dReg.ShaderNOffset + 0x10);

                long vpAPos = basePosition + (uint)vpAOffset;
                long vpBPos = basePosition + (uint)vpBOffset;

                keys[(int)GalShaderType.Vertex] = vpBPos;

                _gpu.Renderer.Shader.Create(vmm, vpAPos, vpBPos, GalShaderType.Vertex);
                _gpu.Renderer.Shader.Bind(vpBPos);

                index = 2;
            }

            for (; index < 6; index++)
            {
                GalShaderType type = GetTypeFromProgram(index);

                int control = ReadRegister(NvGpuEngine3dReg.ShaderNControl + index * 0x10);
                int offset  = ReadRegister(NvGpuEngine3dReg.ShaderNOffset  + index * 0x10);

                //Note: Vertex Program (B) is always enabled.
                bool enable = (control & 1) != 0 || index == 1;

                if (!enable)
                {
                    _gpu.Renderer.Shader.Unbind(type);

                    continue;
                }

                long key = basePosition + (uint)offset;

                keys[(int)type] = key;

                _gpu.Renderer.Shader.Create(vmm, key, type);
                _gpu.Renderer.Shader.Bind(key);
            }

            return keys;
        }

        private static GalShaderType GetTypeFromProgram(int program)
        {
            switch (program)
            {
                case 0:
                case 1: return GalShaderType.Vertex;
                case 2: return GalShaderType.TessControl;
                case 3: return GalShaderType.TessEvaluation;
                case 4: return GalShaderType.Geometry;
                case 5: return GalShaderType.Fragment;
            }

            throw new ArgumentOutOfRangeException(nameof(program));
        }

        private void SetFrontFace(GalPipelineState state)
        {
            float signX = GetFlipSign(NvGpuEngine3dReg.ViewportNScaleX);
            float signY = GetFlipSign(NvGpuEngine3dReg.ViewportNScaleY);

            GalFrontFace frontFace = (GalFrontFace)ReadRegister(NvGpuEngine3dReg.FrontFace);

            //Flipping breaks facing. Flipping front facing too fixes it
            if (signX != signY)
            {
                switch (frontFace)
                {
                    case GalFrontFace.Cw:  frontFace = GalFrontFace.Ccw; break;
                    case GalFrontFace.Ccw: frontFace = GalFrontFace.Cw;  break;
                }
            }

            state.FrontFace = frontFace;
        }

        private void SetCullFace(GalPipelineState state)
        {
            state.CullFaceEnabled = ReadRegisterBool(NvGpuEngine3dReg.CullFaceEnable);

            if (state.CullFaceEnabled)
            {
                state.CullFace = (GalCullFace)ReadRegister(NvGpuEngine3dReg.CullFace);
            }
        }

        private void SetDepth(GalPipelineState state)
        {
            state.DepthTestEnabled = ReadRegisterBool(NvGpuEngine3dReg.DepthTestEnable);

            state.DepthWriteEnabled = ReadRegisterBool(NvGpuEngine3dReg.DepthWriteEnable);

            if (state.DepthTestEnabled)
            {
                state.DepthFunc = (GalComparisonOp)ReadRegister(NvGpuEngine3dReg.DepthTestFunction);
            }

            state.DepthRangeNear = ReadRegisterFloat(NvGpuEngine3dReg.DepthRangeNNear);
            state.DepthRangeFar  = ReadRegisterFloat(NvGpuEngine3dReg.DepthRangeNFar);
        }

        private void SetStencil(GalPipelineState state)
        {
            state.StencilTestEnabled = ReadRegisterBool(NvGpuEngine3dReg.StencilEnable);

            if (state.StencilTestEnabled)
            {
                state.StencilBackFuncFunc = (GalComparisonOp)ReadRegister(NvGpuEngine3dReg.StencilBackFuncFunc);
                state.StencilBackFuncRef  =                  ReadRegister(NvGpuEngine3dReg.StencilBackFuncRef);
                state.StencilBackFuncMask =            (uint)ReadRegister(NvGpuEngine3dReg.StencilBackFuncMask);
                state.StencilBackOpFail   =    (GalStencilOp)ReadRegister(NvGpuEngine3dReg.StencilBackOpFail);
                state.StencilBackOpZFail  =    (GalStencilOp)ReadRegister(NvGpuEngine3dReg.StencilBackOpZFail);
                state.StencilBackOpZPass  =    (GalStencilOp)ReadRegister(NvGpuEngine3dReg.StencilBackOpZPass);
                state.StencilBackMask     =            (uint)ReadRegister(NvGpuEngine3dReg.StencilBackMask);

                state.StencilFrontFuncFunc = (GalComparisonOp)ReadRegister(NvGpuEngine3dReg.StencilFrontFuncFunc);
                state.StencilFrontFuncRef  =                  ReadRegister(NvGpuEngine3dReg.StencilFrontFuncRef);
                state.StencilFrontFuncMask =            (uint)ReadRegister(NvGpuEngine3dReg.StencilFrontFuncMask);
                state.StencilFrontOpFail   =    (GalStencilOp)ReadRegister(NvGpuEngine3dReg.StencilFrontOpFail);
                state.StencilFrontOpZFail  =    (GalStencilOp)ReadRegister(NvGpuEngine3dReg.StencilFrontOpZFail);
                state.StencilFrontOpZPass  =    (GalStencilOp)ReadRegister(NvGpuEngine3dReg.StencilFrontOpZPass);
                state.StencilFrontMask     =            (uint)ReadRegister(NvGpuEngine3dReg.StencilFrontMask);
            }
        }

        private void SetScissor(GalPipelineState state)
        {
            int count = 0;

            for (int index = 0; index < GalPipelineState.RenderTargetsCount; index++)
            {
                state.ScissorTestEnabled[index] = ReadRegisterBool(NvGpuEngine3dReg.ScissorEnable + index * 4);

                if (state.ScissorTestEnabled[index])
                {
                    uint scissorHorizontal = (uint)ReadRegister(NvGpuEngine3dReg.ScissorHorizontal + index * 4);
                    uint scissorVertical   = (uint)ReadRegister(NvGpuEngine3dReg.ScissorVertical   + index * 4);

                    int left  = (int)(scissorHorizontal & 0xFFFF); // Left, lower 16 bits
                    int right = (int)(scissorHorizontal >> 16);    // Right, upper 16 bits

                    int bottom = (int)(scissorVertical & 0xFFFF); // Bottom, lower 16 bits
                    int top    = (int)(scissorVertical >> 16);    // Top, upper 16 bits

                    int width  = Math.Abs(right - left);
                    int height = Math.Abs(top   - bottom);

                    // If the scissor test covers the whole possible viewport, i.e. uninitialized, disable scissor test
                    if ((width > NvGpu.MaxViewportSize && height > NvGpu.MaxViewportSize) || width <= 0 || height <= 0)
                    {
                        state.ScissorTestEnabled[index] = false;
                        continue;
                    }

                    // Keep track of how many scissor tests are active.
                    // If only 1, and it's the first user should apply to all viewports
                    count++;

                    // Flip X
                    if (state.FlipX == -1)
                    {
                        left  = _viewportX1 - (left  - _viewportX0);
                        right = _viewportX1 - (right - _viewportX0);
                    }

                    // Ensure X is in the right order
                    if (left > right)
                    {
                        int temp = left;
                        left     = right;
                        right    = temp;
                    }

                    // Flip Y
                    if (state.FlipY == -1)
                    {
                        bottom = _viewportY1 - (bottom - _viewportY0);
                        top    = _viewportY1 - (top - _viewportY0);
                    }

                    // Ensure Y is in the right order
                    if (bottom > top)
                    {
                        int temp = top;
                        top      = bottom;
                        bottom   = temp;
                    }

                    // Handle out of active viewport dimensions
                    left   = Math.Clamp(left,   _viewportX0, _viewportX1);
                    right  = Math.Clamp(right,  _viewportX0, _viewportX1);
                    top    = Math.Clamp(top,    _viewportY0, _viewportY1);
                    bottom = Math.Clamp(bottom, _viewportY0, _viewportY1);

                    // Save values to state
                    state.ScissorTestX[index] = left;
                    state.ScissorTestY[index] = bottom;

                    state.ScissorTestWidth[index]  = right - left;
                    state.ScissorTestHeight[index] = top - bottom;
                }
            }

            state.ScissorTestCount = count;
        }

        private void SetBlending(GalPipelineState state)
        {
            bool blendIndependent = ReadRegisterBool(NvGpuEngine3dReg.BlendIndependent);

            state.BlendIndependent = blendIndependent;

            for (int index = 0; index < GalPipelineState.RenderTargetsCount; index++)
            {
                if (blendIndependent)
                {
                    state.Blends[index].Enabled = ReadRegisterBool(NvGpuEngine3dReg.IBlendNEnable + index);

                    if (state.Blends[index].Enabled)
                    {
                        state.Blends[index].SeparateAlpha = ReadRegisterBool(NvGpuEngine3dReg.IBlendNSeparateAlpha + index * 8);

                        state.Blends[index].EquationRgb   = ReadBlendEquation(NvGpuEngine3dReg.IBlendNEquationRgb   + index * 8);
                        state.Blends[index].FuncSrcRgb    = ReadBlendFactor  (NvGpuEngine3dReg.IBlendNFuncSrcRgb    + index * 8);
                        state.Blends[index].FuncDstRgb    = ReadBlendFactor  (NvGpuEngine3dReg.IBlendNFuncDstRgb    + index * 8);
                        state.Blends[index].EquationAlpha = ReadBlendEquation(NvGpuEngine3dReg.IBlendNEquationAlpha + index * 8);
                        state.Blends[index].FuncSrcAlpha  = ReadBlendFactor  (NvGpuEngine3dReg.IBlendNFuncSrcAlpha  + index * 8);
                        state.Blends[index].FuncDstAlpha  = ReadBlendFactor  (NvGpuEngine3dReg.IBlendNFuncDstAlpha  + index * 8);
                    }
                }
                else
                {
                    //It seems that even when independent blend is disabled, the first IBlend enable
                    //register is still set to indicate whenever blend is enabled or not (?).
                    state.Blends[index].Enabled = ReadRegisterBool(NvGpuEngine3dReg.IBlendNEnable);

                    if (state.Blends[index].Enabled)
                    {
                        state.Blends[index].SeparateAlpha = ReadRegisterBool(NvGpuEngine3dReg.BlendSeparateAlpha);

                        state.Blends[index].EquationRgb   = ReadBlendEquation(NvGpuEngine3dReg.BlendEquationRgb);
                        state.Blends[index].FuncSrcRgb    = ReadBlendFactor  (NvGpuEngine3dReg.BlendFuncSrcRgb);
                        state.Blends[index].FuncDstRgb    = ReadBlendFactor  (NvGpuEngine3dReg.BlendFuncDstRgb);
                        state.Blends[index].EquationAlpha = ReadBlendEquation(NvGpuEngine3dReg.BlendEquationAlpha);
                        state.Blends[index].FuncSrcAlpha  = ReadBlendFactor  (NvGpuEngine3dReg.BlendFuncSrcAlpha);
                        state.Blends[index].FuncDstAlpha  = ReadBlendFactor  (NvGpuEngine3dReg.BlendFuncDstAlpha);
                    }
                }
            }
        }

        private GalBlendEquation ReadBlendEquation(NvGpuEngine3dReg register)
        {
            return (GalBlendEquation)ReadRegister(register);
        }

        private GalBlendFactor ReadBlendFactor(NvGpuEngine3dReg register)
        {
            return (GalBlendFactor)ReadRegister(register);
        }

        private void SetColorMask(GalPipelineState state)
        {
            bool colorMaskCommon = ReadRegisterBool(NvGpuEngine3dReg.ColorMaskCommon);

            state.ColorMaskCommon = colorMaskCommon;

            for (int index = 0; index < GalPipelineState.RenderTargetsCount; index++)
            {
                int colorMask = ReadRegister(NvGpuEngine3dReg.ColorMaskN + (colorMaskCommon ? 0 : index));

                state.ColorMasks[index].Red   = ((colorMask >> 0)  & 0xf) != 0;
                state.ColorMasks[index].Green = ((colorMask >> 4)  & 0xf) != 0;
                state.ColorMasks[index].Blue  = ((colorMask >> 8)  & 0xf) != 0;
                state.ColorMasks[index].Alpha = ((colorMask >> 12) & 0xf) != 0;
            }
        }

        private void SetPrimitiveRestart(GalPipelineState state)
        {
            state.PrimitiveRestartEnabled = ReadRegisterBool(NvGpuEngine3dReg.PrimRestartEnable);

            if (state.PrimitiveRestartEnabled)
            {
                state.PrimitiveRestartIndex = (uint)ReadRegister(NvGpuEngine3dReg.PrimRestartIndex);
            }
        }

        private void SetRenderTargets()
        {
            //Commercial games do not seem to
            //bool SeparateFragData = ReadRegisterBool(NvGpuEngine3dReg.RTSeparateFragData);

            uint control = (uint)(ReadRegister(NvGpuEngine3dReg.RtControl));

            uint count = control & 0xf;

            if (count > 0)
            {
                int[] map = new int[count];

                for (int index = 0; index < count; index++)
                {
                    int shift = 4 + index * 3;

                    map[index] = (int)((control >> shift) & 7);
                }

                _gpu.Renderer.RenderTarget.SetMap(map);
            }
            else
            {
                _gpu.Renderer.RenderTarget.SetMap(null);
            }
        }

        private void UploadTextures(NvGpuVmm vmm, GalPipelineState state, long[] keys)
        {
            long baseShPosition = MakeInt64From2xInt32(NvGpuEngine3dReg.ShaderAddress);

            int textureCbIndex = ReadRegister(NvGpuEngine3dReg.TextureCbIndex);

            List<(long, GalImage, GalTextureSampler)> unboundTextures = new List<(long, GalImage, GalTextureSampler)>();

            for (int index = 0; index < keys.Length; index++)
            {
                foreach (TextureDescriptor desc in _gpu.Renderer.Shader.GetTextureUsage(keys[index]))
                {
                    int textureHandle;

                    if (desc.IsBindless)
                    {
                        long position = _constBuffers[index][desc.CbufSlot].Position;

                        textureHandle = vmm.ReadInt32(position + desc.CbufOffset * 4);
                    }
                    else
                    {
                        long position = _constBuffers[index][textureCbIndex].Position;

                        textureHandle = vmm.ReadInt32(position + desc.HandleIndex * 4);
                    }

                    unboundTextures.Add(UploadTexture(vmm, textureHandle));
                }
            }

            for (int index = 0; index < unboundTextures.Count; index++)
            {
                (long key, GalImage image, GalTextureSampler sampler) = unboundTextures[index];

                if (key == 0)
                {
                    continue;
                }

                _gpu.Renderer.Texture.Bind(key, index, image);
                _gpu.Renderer.Texture.SetSampler(image, sampler);
            }
        }

        private (long, GalImage, GalTextureSampler) UploadTexture(NvGpuVmm vmm, int textureHandle)
        {
            if (textureHandle == 0)
            {
                //FIXME: Some games like puyo puyo will use handles with the value 0.
                //This is a bug, most likely caused by sync issues.
                return (0, default(GalImage), default(GalTextureSampler));
            }

            bool linkedTsc = ReadRegisterBool(NvGpuEngine3dReg.LinkedTsc);

            int ticIndex = (textureHandle >>  0) & 0xfffff;

            int tscIndex = linkedTsc ? ticIndex : (textureHandle >> 20) & 0xfff;

            long ticPosition = MakeInt64From2xInt32(NvGpuEngine3dReg.TexHeaderPoolOffset);
            long tscPosition = MakeInt64From2xInt32(NvGpuEngine3dReg.TexSamplerPoolOffset);

            ticPosition += ticIndex * 0x20;
            tscPosition += tscIndex * 0x20;

            GalImage image = TextureFactory.MakeTexture(vmm, ticPosition);

            GalTextureSampler sampler = TextureFactory.MakeSampler(_gpu, vmm, tscPosition);

            long key = vmm.ReadInt64(ticPosition + 4) & 0xffffffffffff;

            if (image.Layout == GalMemoryLayout.BlockLinear)
            {
                key &= ~0x1ffL;
            }
            else if (image.Layout == GalMemoryLayout.Pitch)
            {
                key &= ~0x1fL;
            }

            key = vmm.GetPhysicalAddress(key);

            if (key == -1)
            {
                //FIXME: Shouldn't ignore invalid addresses.
                return (0, default(GalImage), default(GalTextureSampler));
            }

            _gpu.ResourceManager.SendTexture(vmm, key, image);

            return (key, image, sampler);
        }

        private void UploadConstBuffers(NvGpuVmm vmm, GalPipelineState state, long[] keys)
        {
            for (int stage = 0; stage < keys.Length; stage++)
            {
                foreach (CBufferDescriptor desc in _gpu.Renderer.Shader.GetConstBufferUsage(keys[stage]))
                {
                    ConstBuffer cb = _constBuffers[stage][desc.Slot];

                    if (!cb.Enabled)
                    {
                        continue;
                    }

                    long key = vmm.GetPhysicalAddress(cb.Position);

                    if (_gpu.ResourceManager.MemoryRegionModified(vmm, key, cb.Size, NvGpuBufferType.ConstBuffer))
                    {
                        if (vmm.TryGetHostAddress(cb.Position, cb.Size, out IntPtr cbPtr))
                        {
                            _gpu.Renderer.Buffer.SetData(key, cb.Size, cbPtr);
                        }
                        else
                        {
                            _gpu.Renderer.Buffer.SetData(key, vmm.ReadBytes(cb.Position, cb.Size));
                        }
                    }

                    state.ConstBufferKeys[stage][desc.Slot] = key;
                }
            }
        }

        private void UploadVertexArrays(NvGpuVmm vmm, GalPipelineState state)
        {
            long ibPosition = MakeInt64From2xInt32(NvGpuEngine3dReg.IndexArrayAddress);

            long iboKey = vmm.GetPhysicalAddress(ibPosition);

            int indexEntryFmt = ReadRegister(NvGpuEngine3dReg.IndexArrayFormat);
            int indexCount    = ReadRegister(NvGpuEngine3dReg.IndexBatchCount);
            int primCtrl      = ReadRegister(NvGpuEngine3dReg.VertexBeginGl);

            GalPrimitiveType primType = (GalPrimitiveType)(primCtrl & 0xffff);

            GalIndexFormat indexFormat = (GalIndexFormat)indexEntryFmt;

            int indexEntrySize = 1 << indexEntryFmt;

            if (indexEntrySize > 4)
            {
                throw new InvalidOperationException("Invalid index entry size \"" + indexEntrySize + "\"!");
            }

            if (indexCount != 0)
            {
                int ibSize = indexCount * indexEntrySize;

                bool iboCached = _gpu.Renderer.Rasterizer.IsIboCached(iboKey, (uint)ibSize);

                bool usesLegacyQuads =
                    primType == GalPrimitiveType.Quads ||
                    primType == GalPrimitiveType.QuadStrip;

                if (!iboCached || _gpu.ResourceManager.MemoryRegionModified(vmm, iboKey, (uint)ibSize, NvGpuBufferType.Index))
                {
                    if (!usesLegacyQuads)
                    {
                        if (vmm.TryGetHostAddress(ibPosition, ibSize, out IntPtr ibPtr))
                        {
                            _gpu.Renderer.Rasterizer.CreateIbo(iboKey, ibSize, ibPtr);
                        }
                        else
                        {
                            _gpu.Renderer.Rasterizer.CreateIbo(iboKey, ibSize, vmm.ReadBytes(ibPosition, ibSize));
                        }
                    }
                    else
                    {
                        byte[] buffer = vmm.ReadBytes(ibPosition, ibSize);

                        if (primType == GalPrimitiveType.Quads)
                        {
                            buffer = QuadHelper.ConvertQuadsToTris(buffer, indexEntrySize, indexCount);
                        }
                        else /* if (PrimType == GalPrimitiveType.QuadStrip) */
                        {
                            buffer = QuadHelper.ConvertQuadStripToTris(buffer, indexEntrySize, indexCount);
                        }

                        _gpu.Renderer.Rasterizer.CreateIbo(iboKey, ibSize, buffer);
                    }
                }

                if (!usesLegacyQuads)
                {
                    _gpu.Renderer.Rasterizer.SetIndexArray(ibSize, indexFormat);
                }
                else
                {
                    if (primType == GalPrimitiveType.Quads)
                    {
                        _gpu.Renderer.Rasterizer.SetIndexArray(QuadHelper.ConvertSizeQuadsToTris(ibSize), indexFormat);
                    }
                    else /* if (PrimType == GalPrimitiveType.QuadStrip) */
                    {
                        _gpu.Renderer.Rasterizer.SetIndexArray(QuadHelper.ConvertSizeQuadStripToTris(ibSize), indexFormat);
                    }
                }
            }

            List<GalVertexAttrib>[] attribs = new List<GalVertexAttrib>[32];

            for (int attr = 0; attr < 16; attr++)
            {
                int packed = ReadRegister(NvGpuEngine3dReg.VertexAttribNFormat + attr);

                int arrayIndex = packed & 0x1f;

                if (attribs[arrayIndex] == null)
                {
                    attribs[arrayIndex] = new List<GalVertexAttrib>();
                }

                long vbPosition = MakeInt64From2xInt32(NvGpuEngine3dReg.VertexArrayNAddress + arrayIndex * 4);

                if (vbPosition == 0)
                {
                    continue;
                }

                bool isConst = ((packed >> 6) & 1) != 0;

                int offset = (packed >> 7) & 0x3fff;

                GalVertexAttribSize size = (GalVertexAttribSize)((packed >> 21) & 0x3f);
                GalVertexAttribType type = (GalVertexAttribType)((packed >> 27) & 0x7);

                bool isRgba = ((packed >> 31) & 1) != 0;

                // Check vertex array is enabled to avoid out of bounds exception when reading bytes
                bool enable = (ReadRegister(NvGpuEngine3dReg.VertexArrayNControl + arrayIndex * 4) & 0x1000) != 0;

                //Note: 16 is the maximum size of an attribute,
                //having a component size of 32-bits with 4 elements (a vec4).
                if (enable)
                {
                    byte[] data = vmm.ReadBytes(vbPosition + offset, 16);

                    attribs[arrayIndex].Add(new GalVertexAttrib(attr, isConst, offset, data, size, type, isRgba));
                }
            }

            state.VertexBindings = new GalVertexBinding[32];

            for (int index = 0; index < 32; index++)
            {
                if (attribs[index] == null)
                {
                    continue;
                }

                int control = ReadRegister(NvGpuEngine3dReg.VertexArrayNControl + index * 4);

                bool enable = (control & 0x1000) != 0;

                if (!enable)
                {
                    continue;
                }

                long vbPosition = MakeInt64From2xInt32(NvGpuEngine3dReg.VertexArrayNAddress + index * 4);
                long vbEndPos   = MakeInt64From2xInt32(NvGpuEngine3dReg.VertexArrayNEndAddr + index * 2);

                int vertexDivisor = ReadRegister(NvGpuEngine3dReg.VertexArrayNDivisor + index * 4);

                bool instanced = ReadRegisterBool(NvGpuEngine3dReg.VertexArrayNInstance + index);

                int stride = control & 0xfff;

                if (instanced && vertexDivisor != 0)
                {
                    vbPosition += stride * (_currentInstance / vertexDivisor);
                }

                if (vbPosition > vbEndPos)
                {
                    //Instance is invalid, ignore the draw call
                    continue;
                }

                long vboKey = vmm.GetPhysicalAddress(vbPosition);

                long vbSize = (vbEndPos - vbPosition) + 1;
                int modifiedVbSize = (int)vbSize;


                // If quads convert size to triangle length
                if (stride == 0)
                {
                    if (primType == GalPrimitiveType.Quads)
                    {
                        modifiedVbSize = QuadHelper.ConvertSizeQuadsToTris(modifiedVbSize);
                    }
                    else if (primType == GalPrimitiveType.QuadStrip)
                    {
                        modifiedVbSize = QuadHelper.ConvertSizeQuadStripToTris(modifiedVbSize);
                    }
                }

                bool vboCached = _gpu.Renderer.Rasterizer.IsVboCached(vboKey, modifiedVbSize);

                if (!vboCached || _gpu.ResourceManager.MemoryRegionModified(vmm, vboKey, vbSize, NvGpuBufferType.Vertex))
                {
                    if ((primType == GalPrimitiveType.Quads | primType == GalPrimitiveType.QuadStrip) && stride != 0)
                    {
                        // Convert quad buffer to triangles
                        byte[] data = vmm.ReadBytes(vbPosition, vbSize);

                        if (primType == GalPrimitiveType.Quads)
                        {
                            data = QuadHelper.ConvertQuadsToTris(data, stride, (int)(vbSize / stride));
                        }
                        else
                        {
                            data = QuadHelper.ConvertQuadStripToTris(data, stride, (int)(vbSize / stride));
                        }
                        _gpu.Renderer.Rasterizer.CreateVbo(vboKey, data);
                    }
                    else if (vmm.TryGetHostAddress(vbPosition, vbSize, out IntPtr vbPtr))
                    {
                        _gpu.Renderer.Rasterizer.CreateVbo(vboKey, (int)vbSize, vbPtr);
                    }
                    else
                    {
                        _gpu.Renderer.Rasterizer.CreateVbo(vboKey, vmm.ReadBytes(vbPosition, vbSize));
                    }
                }

                state.VertexBindings[index].Enabled   = true;
                state.VertexBindings[index].Stride    = stride;
                state.VertexBindings[index].VboKey    = vboKey;
                state.VertexBindings[index].Instanced = instanced;
                state.VertexBindings[index].Divisor   = vertexDivisor;
                state.VertexBindings[index].Attribs   = attribs[index].ToArray();
            }
        }

        private void DispatchRender(NvGpuVmm vmm, GalPipelineState state)
        {
            int indexCount = ReadRegister(NvGpuEngine3dReg.IndexBatchCount);
            int primCtrl   = ReadRegister(NvGpuEngine3dReg.VertexBeginGl);

            GalPrimitiveType primType = (GalPrimitiveType)(primCtrl & 0xffff);

            bool instanceNext = ((primCtrl >> 26) & 1) != 0;
            bool instanceCont = ((primCtrl >> 27) & 1) != 0;

            if (instanceNext && instanceCont)
            {
                throw new InvalidOperationException("GPU tried to increase and reset instance count at the same time");
            }

            if (instanceNext)
            {
                _currentInstance++;
            }
            else if (!instanceCont)
            {
                _currentInstance = 0;
            }

            state.Instance = _currentInstance;

            _gpu.Renderer.Pipeline.Bind(state);

            _gpu.Renderer.RenderTarget.Bind();

            if (indexCount != 0)
            {
                int indexEntryFmt = ReadRegister(NvGpuEngine3dReg.IndexArrayFormat);
                int indexFirst    = ReadRegister(NvGpuEngine3dReg.IndexBatchFirst);
                int vertexBase    = ReadRegister(NvGpuEngine3dReg.VertexArrayElemBase);

                long indexPosition = MakeInt64From2xInt32(NvGpuEngine3dReg.IndexArrayAddress);

                long iboKey = vmm.GetPhysicalAddress(indexPosition);

                //Quad primitive types were deprecated on OpenGL 3.x,
                //they are converted to a triangles index buffer on IB creation,
                //so we should use the triangles type here too.
                if (primType == GalPrimitiveType.Quads || primType == GalPrimitiveType.QuadStrip)
                {
                    //Note: We assume that index first points to the first
                    //vertex of a quad, if it points to the middle of a
                    //quad (First % 4 != 0 for Quads) then it will not work properly.
                    if (primType == GalPrimitiveType.Quads)
                    {
                        indexFirst = QuadHelper.ConvertSizeQuadsToTris(indexFirst);
                    }
                    else // QuadStrip
                    {
                        indexFirst = QuadHelper.ConvertSizeQuadStripToTris(indexFirst);
                    }

                    primType = GalPrimitiveType.Triangles;
                }

                _gpu.Renderer.Rasterizer.DrawElements(iboKey, indexFirst, vertexBase, primType);
            }
            else
            {
                int vertexFirst = ReadRegister(NvGpuEngine3dReg.VertexArrayFirst);
                int vertexCount = ReadRegister(NvGpuEngine3dReg.VertexArrayCount);

                //Quad primitive types were deprecated on OpenGL 3.x,
                //they are converted to a triangles index buffer on IB creation,
                //so we should use the triangles type here too.
                if (primType == GalPrimitiveType.Quads || primType == GalPrimitiveType.QuadStrip)
                {
                    //Note: We assume that index first points to the first
                    //vertex of a quad, if it points to the middle of a
                    //quad (First % 4 != 0 for Quads) then it will not work properly.
                    if (primType == GalPrimitiveType.Quads)
                    {
                        vertexFirst = QuadHelper.ConvertSizeQuadsToTris(vertexFirst);
                    }
                    else // QuadStrip
                    {
                        vertexFirst = QuadHelper.ConvertSizeQuadStripToTris(vertexFirst);
                    }

                    primType = GalPrimitiveType.Triangles;
                    vertexCount = QuadHelper.ConvertSizeQuadsToTris(vertexCount);
                }

                _gpu.Renderer.Rasterizer.DrawArrays(vertexFirst, vertexCount, primType);
            }

            // Reset pipeline for host OpenGL calls
            _gpu.Renderer.Pipeline.Unbind(state);

            //Is the GPU really clearing those registers after draw?
            WriteRegister(NvGpuEngine3dReg.IndexBatchFirst, 0);
            WriteRegister(NvGpuEngine3dReg.IndexBatchCount, 0);
        }

        private enum QueryMode
        {
            WriteSeq,
            Sync,
            WriteCounterAndTimestamp
        }

        private void QueryControl(NvGpuVmm vmm, GpuMethodCall methCall)
        {
            WriteRegister(methCall);

            long position = MakeInt64From2xInt32(NvGpuEngine3dReg.QueryAddress);

            int seq  = Registers[(int)NvGpuEngine3dReg.QuerySequence];
            int ctrl = Registers[(int)NvGpuEngine3dReg.QueryControl];

            QueryMode mode = (QueryMode)(ctrl & 3);

            switch (mode)
            {
                case QueryMode.WriteSeq: vmm.WriteInt32(position, seq); break;

                case QueryMode.WriteCounterAndTimestamp:
                {
                    //TODO: Implement counters.
                    long counter = 1;

                    long timestamp = PerformanceCounter.ElapsedMilliseconds;

                    vmm.WriteInt64(position + 0, counter);
                    vmm.WriteInt64(position + 8, timestamp);

                    break;
                }
            }
        }

        private void CbData(NvGpuVmm vmm, GpuMethodCall methCall)
        {
            long position = MakeInt64From2xInt32(NvGpuEngine3dReg.ConstBufferAddress);

            int offset = ReadRegister(NvGpuEngine3dReg.ConstBufferOffset);

            vmm.WriteInt32(position + offset, methCall.Argument);

            WriteRegister(NvGpuEngine3dReg.ConstBufferOffset, offset + 4);

            _gpu.ResourceManager.ClearPbCache(NvGpuBufferType.ConstBuffer);
        }

        private void CbBind(NvGpuVmm vmm, GpuMethodCall methCall)
        {
            int stage = (methCall.Method - 0x904) >> 3;

            int index = methCall.Argument;

            bool enabled = (index & 1) != 0;

            index = (index >> 4) & 0x1f;

            long position = MakeInt64From2xInt32(NvGpuEngine3dReg.ConstBufferAddress);

            long cbKey = vmm.GetPhysicalAddress(position);

            int size = ReadRegister(NvGpuEngine3dReg.ConstBufferSize);

            if (!_gpu.Renderer.Buffer.IsCached(cbKey, size))
            {
                _gpu.Renderer.Buffer.Create(cbKey, size);
            }

            ConstBuffer cb = _constBuffers[stage][index];

            if (cb.Position != position || cb.Enabled != enabled || cb.Size != size)
            {
                _constBuffers[stage][index].Position = position;
                _constBuffers[stage][index].Enabled = enabled;
                _constBuffers[stage][index].Size = size;
            }
        }

        private float GetFlipSign(NvGpuEngine3dReg reg)
        {
            return MathF.Sign(ReadRegisterFloat(reg));
        }

        private long MakeInt64From2xInt32(NvGpuEngine3dReg reg)
        {
            return
                (long)Registers[(int)reg + 0] << 32 |
                (uint)Registers[(int)reg + 1];
        }

        private void WriteRegister(GpuMethodCall methCall)
        {
            Registers[methCall.Method] = methCall.Argument;
        }

        private int ReadRegister(NvGpuEngine3dReg reg)
        {
            return Registers[(int)reg];
        }

        private float ReadRegisterFloat(NvGpuEngine3dReg reg)
        {
            return BitConverter.Int32BitsToSingle(ReadRegister(reg));
        }

        private bool ReadRegisterBool(NvGpuEngine3dReg reg)
        {
            return (ReadRegister(reg) & 1) != 0;
        }

        private void WriteRegister(NvGpuEngine3dReg reg, int value)
        {
            Registers[(int)reg] = value;
        }
    }
}