From 9e450f7d41762be03e8f9af258cd59e267604e38 Mon Sep 17 00:00:00 2001 From: pineappleEA Date: Sat, 2 Oct 2021 08:41:27 +0200 Subject: [PATCH] early-access version 2097 --- README.md | 2 +- .../service/nvdrv/devices/nvdisp_disp0.cpp | 5 +- .../hle/service/nvdrv/devices/nvhost_ctrl.cpp | 30 +- .../hle/service/nvdrv/devices/nvhost_gpu.cpp | 13 +- src/core/hle/service/nvflinger/nvflinger.cpp | 29 +- src/core/hle/service/nvflinger/nvflinger.h | 11 +- src/video_core/cdma_pusher.cpp | 1 + src/video_core/cdma_pusher.h | 2 +- src/video_core/engines/maxwell_3d.h | 1 + src/video_core/framebuffer_config.h | 20 +- src/video_core/gpu.cpp | 1359 +++++++++++------ src/video_core/gpu.h | 228 +-- src/video_core/gpu_thread.h | 3 - src/video_core/query_cache.h | 1 + src/video_core/shader_environment.cpp | 1 + src/video_core/shader_environment.h | 4 +- .../texture_cache/texture_cache_base.h | 2 +- 17 files changed, 954 insertions(+), 758 deletions(-) diff --git a/README.md b/README.md index 4866785e7..3e54ae0da 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ yuzu emulator early access ============= -This is the source code for early-access 2096. +This is the source code for early-access 2097. ## Legal Notice diff --git a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp index 789000294..4ee8c5733 100755 --- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp +++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp @@ -48,8 +48,9 @@ void nvdisp_disp0::flip(u32 buffer_handle, u32 offset, u32 format, u32 width, u3 addr, offset, width, height, stride, format); const auto pixel_format = static_cast(format); - const Tegra::FramebufferConfig framebuffer{addr, offset, width, height, - stride, pixel_format, transform, crop_rect}; + const auto transform_flags = static_cast(transform); + const Tegra::FramebufferConfig framebuffer{addr, offset, width, height, + stride, pixel_format, transform_flags, crop_rect}; system.GetPerfStats().EndSystemFrame(); system.GPU().SwapBuffers(&framebuffer); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp index 775e76330..8b4867ca7 100755 --- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp @@ -111,7 +111,6 @@ NvResult nvhost_ctrl::IocCtrlEventWait(const std::vector& input, std::vector event.event->GetWritableEvent().Signal(); return NvResult::Success; } - auto lock = gpu.LockSync(); const u32 current_syncpoint_value = event.fence.value; const s32 diff = current_syncpoint_value - params.threshold; if (diff >= 0) { @@ -132,23 +131,24 @@ NvResult nvhost_ctrl::IocCtrlEventWait(const std::vector& input, std::vector } EventState status = events_interface.status[event_id]; - if (event_id < MaxNvEvents || status == EventState::Free || status == EventState::Registered) { - events_interface.SetEventStatus(event_id, EventState::Waiting); - events_interface.assigned_syncpt[event_id] = params.syncpt_id; - events_interface.assigned_value[event_id] = target_value; - if (is_async) { - params.value = params.syncpt_id << 4; - } else { - params.value = ((params.syncpt_id & 0xfff) << 16) | 0x10000000; - } - params.value |= event_id; - event.event->GetWritableEvent().Clear(); - gpu.RegisterSyncptInterrupt(params.syncpt_id, target_value); + const bool bad_parameter = status != EventState::Free && status != EventState::Registered; + if (bad_parameter) { std::memcpy(output.data(), ¶ms, sizeof(params)); - return NvResult::Timeout; + return NvResult::BadParameter; } + events_interface.SetEventStatus(event_id, EventState::Waiting); + events_interface.assigned_syncpt[event_id] = params.syncpt_id; + events_interface.assigned_value[event_id] = target_value; + if (is_async) { + params.value = params.syncpt_id << 4; + } else { + params.value = ((params.syncpt_id & 0xfff) << 16) | 0x10000000; + } + params.value |= event_id; + event.event->GetWritableEvent().Clear(); + gpu.RegisterSyncptInterrupt(params.syncpt_id, target_value); std::memcpy(output.data(), ¶ms, sizeof(params)); - return NvResult::BadParameter; + return NvResult::Timeout; } NvResult nvhost_ctrl::IocCtrlEventRegister(const std::vector& input, std::vector& output) { diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp index c0a380088..54ac105d5 100755 --- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp @@ -13,6 +13,14 @@ #include "video_core/memory_manager.h" namespace Service::Nvidia::Devices { +namespace { +Tegra::CommandHeader BuildFenceAction(Tegra::GPU::FenceOperation op, u32 syncpoint_id) { + Tegra::GPU::FenceAction result{}; + result.op.Assign(op); + result.syncpoint_id.Assign(syncpoint_id); + return {result.raw}; +} +} // namespace nvhost_gpu::nvhost_gpu(Core::System& system_, std::shared_ptr nvmap_dev_, SyncpointManager& syncpoint_manager_) @@ -187,7 +195,7 @@ static std::vector BuildWaitCommandList(Fence fence) { {fence.value}, Tegra::BuildCommandHeader(Tegra::BufferMethods::FenceAction, 1, Tegra::SubmissionMode::Increasing), - Tegra::GPU::FenceAction::Build(Tegra::GPU::FenceOperation::Acquire, fence.id), + BuildFenceAction(Tegra::GPU::FenceOperation::Acquire, fence.id), }; } @@ -200,8 +208,7 @@ static std::vector BuildIncrementCommandList(Fence fence, for (u32 count = 0; count < add_increment; ++count) { result.emplace_back(Tegra::BuildCommandHeader(Tegra::BufferMethods::FenceAction, 1, Tegra::SubmissionMode::Increasing)); - result.emplace_back( - Tegra::GPU::FenceAction::Build(Tegra::GPU::FenceOperation::Increment, fence.id)); + result.emplace_back(BuildFenceAction(Tegra::GPU::FenceOperation::Increment, fence.id)); } return result; diff --git a/src/core/hle/service/nvflinger/nvflinger.cpp b/src/core/hle/service/nvflinger/nvflinger.cpp index 3ead813b0..a22811ec1 100755 --- a/src/core/hle/service/nvflinger/nvflinger.cpp +++ b/src/core/hle/service/nvflinger/nvflinger.cpp @@ -13,28 +13,20 @@ #include "common/thread.h" #include "core/core.h" #include "core/core_timing.h" -#include "core/core_timing_util.h" -#include "core/hardware_properties.h" #include "core/hle/kernel/k_readable_event.h" -#include "core/hle/kernel/kernel.h" #include "core/hle/service/nvdrv/devices/nvdisp_disp0.h" #include "core/hle/service/nvdrv/nvdrv.h" #include "core/hle/service/nvflinger/buffer_queue.h" #include "core/hle/service/nvflinger/nvflinger.h" #include "core/hle/service/vi/display/vi_display.h" #include "core/hle/service/vi/layer/vi_layer.h" -#include "core/perf_stats.h" -#include "video_core/renderer_base.h" +#include "video_core/gpu.h" namespace Service::NVFlinger { constexpr auto frame_ns = std::chrono::nanoseconds{1000000000 / 60}; -void NVFlinger::VSyncThread(NVFlinger& nv_flinger) { - nv_flinger.SplitVSync(); -} - -void NVFlinger::SplitVSync() { +void NVFlinger::SplitVSync(std::stop_token stop_token) { system.RegisterHostThread(); std::string name = "yuzu:VSyncThread"; MicroProfileOnThreadCreate(name.c_str()); @@ -45,7 +37,7 @@ void NVFlinger::SplitVSync() { Common::SetCurrentThreadName(name.c_str()); Common::SetCurrentThreadPriority(Common::ThreadPriority::High); s64 delay = 0; - while (is_running) { + while (!stop_token.stop_requested()) { guard->lock(); const s64 time_start = system.CoreTiming().GetGlobalTimeNs().count(); Compose(); @@ -55,7 +47,7 @@ void NVFlinger::SplitVSync() { const s64 next_time = std::max(0, ticks - time_passed - delay); guard->unlock(); if (next_time > 0) { - wait_event->WaitFor(std::chrono::nanoseconds{next_time}); + std::this_thread::sleep_for(std::chrono::nanoseconds{next_time}); } delay = (system.CoreTiming().GetGlobalTimeNs().count() - time_end) - next_time; } @@ -84,9 +76,7 @@ NVFlinger::NVFlinger(Core::System& system_) }); if (system.IsMulticore()) { - is_running = true; - wait_event = std::make_unique(); - vsync_thread = std::make_unique(VSyncThread, std::ref(*this)); + vsync_thread = std::jthread([this](std::stop_token token) { SplitVSync(token); }); } else { system.CoreTiming().ScheduleEvent(frame_ns, composition_event); } @@ -96,14 +86,7 @@ NVFlinger::~NVFlinger() { for (auto& buffer_queue : buffer_queues) { buffer_queue->Disconnect(); } - - if (system.IsMulticore()) { - is_running = false; - wait_event->Set(); - vsync_thread->join(); - vsync_thread.reset(); - wait_event.reset(); - } else { + if (!system.IsMulticore()) { system.CoreTiming().UnscheduleEvent(composition_event, 0); } } diff --git a/src/core/hle/service/nvflinger/nvflinger.h b/src/core/hle/service/nvflinger/nvflinger.h index 6d84cafb4..7935cf773 100755 --- a/src/core/hle/service/nvflinger/nvflinger.h +++ b/src/core/hle/service/nvflinger/nvflinger.h @@ -4,13 +4,10 @@ #pragma once -#include #include #include #include #include -#include -#include #include #include @@ -109,9 +106,7 @@ private: /// Creates a layer with the specified layer ID in the desired display. void CreateLayerAtId(VI::Display& display, u64 layer_id); - static void VSyncThread(NVFlinger& nv_flinger); - - void SplitVSync(); + void SplitVSync(std::stop_token stop_token); std::shared_ptr nvdrv; @@ -133,9 +128,7 @@ private: Core::System& system; - std::unique_ptr vsync_thread; - std::unique_ptr wait_event; - std::atomic is_running{}; + std::jthread vsync_thread; KernelHelpers::ServiceContext service_context; }; diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp index 8b86ad050..a8c4b4415 100755 --- a/src/video_core/cdma_pusher.cpp +++ b/src/video_core/cdma_pusher.cpp @@ -24,6 +24,7 @@ #include "command_classes/vic.h" #include "video_core/cdma_pusher.h" #include "video_core/command_classes/nvdec_common.h" +#include "video_core/command_classes/sync_manager.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/gpu.h" #include "video_core/memory_manager.h" diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h index 1bada44dd..87b49d6ea 100755 --- a/src/video_core/cdma_pusher.h +++ b/src/video_core/cdma_pusher.h @@ -9,13 +9,13 @@ #include "common/bit_field.h" #include "common/common_types.h" -#include "video_core/command_classes/sync_manager.h" namespace Tegra { class GPU; class Host1x; class Nvdec; +class SyncptIncrManager; class Vic; enum class ChSubmissionMode : u32 { diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 7f4ca6282..f22342dfb 100755 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -6,6 +6,7 @@ #include #include +#include #include #include #include diff --git a/src/video_core/framebuffer_config.h b/src/video_core/framebuffer_config.h index b86c3a757..b1d455e30 100755 --- a/src/video_core/framebuffer_config.h +++ b/src/video_core/framebuffer_config.h @@ -4,8 +4,10 @@ #pragma once -namespace Tegra { +#include "common/common_types.h" +#include "common/math_util.h" +namespace Tegra { /** * Struct describing framebuffer configuration */ @@ -16,6 +18,21 @@ struct FramebufferConfig { B8G8R8A8_UNORM = 5, }; + enum class TransformFlags : u32 { + /// No transform flags are set + Unset = 0x00, + /// Flip source image horizontally (around the vertical axis) + FlipH = 0x01, + /// Flip source image vertically (around the horizontal axis) + FlipV = 0x02, + /// Rotate source image 90 degrees clockwise + Rotate90 = 0x04, + /// Rotate source image 180 degrees + Rotate180 = 0x03, + /// Rotate source image 270 degrees clockwise + Rotate270 = 0x07, + }; + VAddr address{}; u32 offset{}; u32 width{}; @@ -23,7 +40,6 @@ struct FramebufferConfig { u32 stride{}; PixelFormat pixel_format{}; - using TransformFlags = Service::NVFlinger::BufferQueue::BufferTransformFlags; TransformFlags transform_flags{}; Common::Rectangle crop_rect; }; diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 2ae3639b5..ab7c21a49 100755 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -2,540 +2,913 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include +#include #include +#include +#include +#include #include "common/assert.h" #include "common/microprofile.h" #include "common/settings.h" #include "core/core.h" #include "core/core_timing.h" -#include "core/core_timing_util.h" #include "core/frontend/emu_window.h" #include "core/hardware_interrupt_manager.h" -#include "core/memory.h" +#include "core/hle/service/nvdrv/nvdata.h" +#include "core/hle/service/nvflinger/buffer_queue.h" #include "core/perf_stats.h" +#include "video_core/cdma_pusher.h" +#include "video_core/dma_pusher.h" #include "video_core/engines/fermi_2d.h" #include "video_core/engines/kepler_compute.h" #include "video_core/engines/kepler_memory.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/maxwell_dma.h" #include "video_core/gpu.h" +#include "video_core/gpu_thread.h" #include "video_core/memory_manager.h" #include "video_core/renderer_base.h" #include "video_core/shader_notify.h" -#include "video_core/video_core.h" namespace Tegra { MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192)); -GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_) - : system{system_}, memory_manager{std::make_unique(system)}, - dma_pusher{std::make_unique(system, *this)}, use_nvdec{use_nvdec_}, - maxwell_3d{std::make_unique(system, *memory_manager)}, - fermi_2d{std::make_unique()}, - kepler_compute{std::make_unique(system, *memory_manager)}, - maxwell_dma{std::make_unique(system, *memory_manager)}, - kepler_memory{std::make_unique(system, *memory_manager)}, - shader_notify{std::make_unique()}, is_async{is_async_}, - gpu_thread{system_, is_async_} {} +struct GPU::Impl { + explicit Impl(GPU& gpu_, Core::System& system_, bool is_async_, bool use_nvdec_) + : gpu{gpu_}, system{system_}, memory_manager{std::make_unique( + system)}, + dma_pusher{std::make_unique(system, gpu)}, use_nvdec{use_nvdec_}, + maxwell_3d{std::make_unique(system, *memory_manager)}, + fermi_2d{std::make_unique()}, + kepler_compute{std::make_unique(system, *memory_manager)}, + maxwell_dma{std::make_unique(system, *memory_manager)}, + kepler_memory{std::make_unique(system, *memory_manager)}, + shader_notify{std::make_unique()}, is_async{is_async_}, + gpu_thread{system_, is_async_} {} + + ~Impl() = default; + + /// Binds a renderer to the GPU. + void BindRenderer(std::unique_ptr renderer_) { + renderer = std::move(renderer_); + rasterizer = renderer->ReadRasterizer(); + + memory_manager->BindRasterizer(rasterizer); + maxwell_3d->BindRasterizer(rasterizer); + fermi_2d->BindRasterizer(rasterizer); + kepler_compute->BindRasterizer(rasterizer); + maxwell_dma->BindRasterizer(rasterizer); + } + + /// Calls a GPU method. + void CallMethod(const GPU::MethodCall& method_call) { + LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method_call.method, + method_call.subchannel); + + ASSERT(method_call.subchannel < bound_engines.size()); + + if (ExecuteMethodOnEngine(method_call.method)) { + CallEngineMethod(method_call); + } else { + CallPullerMethod(method_call); + } + } + + /// Calls a GPU multivalue method. + void CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, + u32 methods_pending) { + LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method, subchannel); + + ASSERT(subchannel < bound_engines.size()); + + if (ExecuteMethodOnEngine(method)) { + CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending); + } else { + for (std::size_t i = 0; i < amount; i++) { + CallPullerMethod(GPU::MethodCall{ + method, + base_start[i], + subchannel, + methods_pending - static_cast(i), + }); + } + } + } + + /// Flush all current written commands into the host GPU for execution. + void FlushCommands() { + rasterizer->FlushCommands(); + } + + /// Synchronizes CPU writes with Host GPU memory. + void SyncGuestHost() { + rasterizer->SyncGuestHost(); + } + + /// Signal the ending of command list. + void OnCommandListEnd() { + if (is_async) { + // This command only applies to asynchronous GPU mode + gpu_thread.OnCommandListEnd(); + } + } + + /// Request a host GPU memory flush from the CPU. + [[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size) { + std::unique_lock lck{flush_request_mutex}; + const u64 fence = ++last_flush_fence; + flush_requests.emplace_back(fence, addr, size); + return fence; + } + + /// Obtains current flush request fence id. + [[nodiscard]] u64 CurrentFlushRequestFence() const { + return current_flush_fence.load(std::memory_order_relaxed); + } + + /// Tick pending requests within the GPU. + void TickWork() { + std::unique_lock lck{flush_request_mutex}; + while (!flush_requests.empty()) { + auto& request = flush_requests.front(); + const u64 fence = request.fence; + const VAddr addr = request.addr; + const std::size_t size = request.size; + flush_requests.pop_front(); + flush_request_mutex.unlock(); + rasterizer->FlushRegion(addr, size); + current_flush_fence.store(fence); + flush_request_mutex.lock(); + } + } + + /// Returns a reference to the Maxwell3D GPU engine. + [[nodiscard]] Engines::Maxwell3D& Maxwell3D() { + return *maxwell_3d; + } + + /// Returns a const reference to the Maxwell3D GPU engine. + [[nodiscard]] const Engines::Maxwell3D& Maxwell3D() const { + return *maxwell_3d; + } + + /// Returns a reference to the KeplerCompute GPU engine. + [[nodiscard]] Engines::KeplerCompute& KeplerCompute() { + return *kepler_compute; + } + + /// Returns a reference to the KeplerCompute GPU engine. + [[nodiscard]] const Engines::KeplerCompute& KeplerCompute() const { + return *kepler_compute; + } + + /// Returns a reference to the GPU memory manager. + [[nodiscard]] Tegra::MemoryManager& MemoryManager() { + return *memory_manager; + } + + /// Returns a const reference to the GPU memory manager. + [[nodiscard]] const Tegra::MemoryManager& MemoryManager() const { + return *memory_manager; + } + + /// Returns a reference to the GPU DMA pusher. + [[nodiscard]] Tegra::DmaPusher& DmaPusher() { + return *dma_pusher; + } + + /// Returns a const reference to the GPU DMA pusher. + [[nodiscard]] const Tegra::DmaPusher& DmaPusher() const { + return *dma_pusher; + } + + /// Returns a reference to the GPU CDMA pusher. + [[nodiscard]] Tegra::CDmaPusher& CDmaPusher() { + return *cdma_pusher; + } + + /// Returns a const reference to the GPU CDMA pusher. + [[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const { + return *cdma_pusher; + } + + /// Returns a reference to the underlying renderer. + [[nodiscard]] VideoCore::RendererBase& Renderer() { + return *renderer; + } + + /// Returns a const reference to the underlying renderer. + [[nodiscard]] const VideoCore::RendererBase& Renderer() const { + return *renderer; + } + + /// Returns a reference to the shader notifier. + [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify() { + return *shader_notify; + } + + /// Returns a const reference to the shader notifier. + [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const { + return *shader_notify; + } + + /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame. + void WaitFence(u32 syncpoint_id, u32 value) { + // Synced GPU, is always in sync + if (!is_async) { + return; + } + if (syncpoint_id == UINT32_MAX) { + // TODO: Research what this does. + LOG_ERROR(HW_GPU, "Waiting for syncpoint -1 not implemented"); + return; + } + MICROPROFILE_SCOPE(GPU_wait); + std::unique_lock lock{sync_mutex}; + sync_cv.wait(lock, [=, this] { + if (shutting_down.load(std::memory_order_relaxed)) { + // We're shutting down, ensure no threads continue to wait for the next syncpoint + return true; + } + return syncpoints.at(syncpoint_id).load() >= value; + }); + } + + void IncrementSyncPoint(u32 syncpoint_id) { + auto& syncpoint = syncpoints.at(syncpoint_id); + syncpoint++; + std::lock_guard lock{sync_mutex}; + sync_cv.notify_all(); + auto& interrupt = syncpt_interrupts.at(syncpoint_id); + if (!interrupt.empty()) { + u32 value = syncpoint.load(); + auto it = interrupt.begin(); + while (it != interrupt.end()) { + if (value >= *it) { + TriggerCpuInterrupt(syncpoint_id, *it); + it = interrupt.erase(it); + continue; + } + it++; + } + } + } + + [[nodiscard]] u32 GetSyncpointValue(u32 syncpoint_id) const { + return syncpoints.at(syncpoint_id).load(); + } + + void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value) { + std::lock_guard lock{sync_mutex}; + auto& interrupt = syncpt_interrupts.at(syncpoint_id); + bool contains = std::any_of(interrupt.begin(), interrupt.end(), + [value](u32 in_value) { return in_value == value; }); + if (contains) { + return; + } + interrupt.emplace_back(value); + } + + [[nodiscard]] bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value) { + std::lock_guard lock{sync_mutex}; + auto& interrupt = syncpt_interrupts.at(syncpoint_id); + const auto iter = + std::find_if(interrupt.begin(), interrupt.end(), + [value](u32 interrupt_value) { return value == interrupt_value; }); + + if (iter == interrupt.end()) { + return false; + } + interrupt.erase(iter); + return true; + } + + [[nodiscard]] u64 GetTicks() const { + // This values were reversed engineered by fincs from NVN + // The gpu clock is reported in units of 385/625 nanoseconds + constexpr u64 gpu_ticks_num = 384; + constexpr u64 gpu_ticks_den = 625; + + u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count(); + if (Settings::values.use_fast_gpu_time.GetValue()) { + nanoseconds /= 256; + } + const u64 nanoseconds_num = nanoseconds / gpu_ticks_den; + const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den; + return nanoseconds_num * gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den; + } + + [[nodiscard]] bool IsAsync() const { + return is_async; + } + + [[nodiscard]] bool UseNvdec() const { + return use_nvdec; + } + + void RendererFrameEndNotify() { + system.GetPerfStats().EndGameFrame(); + } + + /// Performs any additional setup necessary in order to begin GPU emulation. + /// This can be used to launch any necessary threads and register any necessary + /// core timing events. + void Start() { + gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher); + cpu_context = renderer->GetRenderWindow().CreateSharedContext(); + cpu_context->MakeCurrent(); + } + + /// Obtain the CPU Context + void ObtainContext() { + cpu_context->MakeCurrent(); + } + + /// Release the CPU Context + void ReleaseContext() { + cpu_context->DoneCurrent(); + } + + /// Push GPU command entries to be processed + void PushGPUEntries(Tegra::CommandList&& entries) { + gpu_thread.SubmitList(std::move(entries)); + } + + /// Push GPU command buffer entries to be processed + void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) { + if (!use_nvdec) { + return; + } + + if (!cdma_pusher) { + cdma_pusher = std::make_unique(gpu); + } + + // SubmitCommandBuffer would make the nvdec operations async, this is not currently working + // TODO(ameerj): RE proper async nvdec operation + // gpu_thread.SubmitCommandBuffer(std::move(entries)); + + cdma_pusher->ProcessEntries(std::move(entries)); + } + + /// Frees the CDMAPusher instance to free up resources + void ClearCdmaInstance() { + cdma_pusher.reset(); + } + + /// Swap buffers (render frame) + void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { + gpu_thread.SwapBuffers(framebuffer); + } + + /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory + void FlushRegion(VAddr addr, u64 size) { + gpu_thread.FlushRegion(addr, size); + } + + /// Notify rasterizer that any caches of the specified region should be invalidated + void InvalidateRegion(VAddr addr, u64 size) { + gpu_thread.InvalidateRegion(addr, size); + } + + /// Notify rasterizer that any caches of the specified region should be flushed and invalidated + void FlushAndInvalidateRegion(VAddr addr, u64 size) { + gpu_thread.FlushAndInvalidateRegion(addr, size); + } + + void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const { + auto& interrupt_manager = system.InterruptManager(); + interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value); + } + + void ProcessBindMethod(const GPU::MethodCall& method_call) { + // Bind the current subchannel to the desired engine id. + LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", method_call.subchannel, + method_call.argument); + const auto engine_id = static_cast(method_call.argument); + bound_engines[method_call.subchannel] = static_cast(engine_id); + switch (engine_id) { + case EngineID::FERMI_TWOD_A: + dma_pusher->BindSubchannel(fermi_2d.get(), method_call.subchannel); + break; + case EngineID::MAXWELL_B: + dma_pusher->BindSubchannel(maxwell_3d.get(), method_call.subchannel); + break; + case EngineID::KEPLER_COMPUTE_B: + dma_pusher->BindSubchannel(kepler_compute.get(), method_call.subchannel); + break; + case EngineID::MAXWELL_DMA_COPY_A: + dma_pusher->BindSubchannel(maxwell_dma.get(), method_call.subchannel); + break; + case EngineID::KEPLER_INLINE_TO_MEMORY_B: + dma_pusher->BindSubchannel(kepler_memory.get(), method_call.subchannel); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id); + } + } + + void ProcessFenceActionMethod() { + switch (regs.fence_action.op) { + case GPU::FenceOperation::Acquire: + WaitFence(regs.fence_action.syncpoint_id, regs.fence_value); + break; + case GPU::FenceOperation::Increment: + IncrementSyncPoint(regs.fence_action.syncpoint_id); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented operation {}", regs.fence_action.op.Value()); + } + } + + void ProcessWaitForInterruptMethod() { + // TODO(bunnei) ImplementMe + LOG_WARNING(HW_GPU, "(STUBBED) called"); + } + + void ProcessSemaphoreTriggerMethod() { + const auto semaphoreOperationMask = 0xF; + const auto op = + static_cast(regs.semaphore_trigger & semaphoreOperationMask); + if (op == GpuSemaphoreOperation::WriteLong) { + struct Block { + u32 sequence; + u32 zeros = 0; + u64 timestamp; + }; + + Block block{}; + block.sequence = regs.semaphore_sequence; + // TODO(Kmather73): Generate a real GPU timestamp and write it here instead of + // CoreTiming + block.timestamp = GetTicks(); + memory_manager->WriteBlock(regs.semaphore_address.SemaphoreAddress(), &block, + sizeof(block)); + } else { + const u32 word{memory_manager->Read(regs.semaphore_address.SemaphoreAddress())}; + if ((op == GpuSemaphoreOperation::AcquireEqual && word == regs.semaphore_sequence) || + (op == GpuSemaphoreOperation::AcquireGequal && + static_cast(word - regs.semaphore_sequence) > 0) || + (op == GpuSemaphoreOperation::AcquireMask && (word & regs.semaphore_sequence))) { + // Nothing to do in this case + } else { + regs.acquire_source = true; + regs.acquire_value = regs.semaphore_sequence; + if (op == GpuSemaphoreOperation::AcquireEqual) { + regs.acquire_active = true; + regs.acquire_mode = false; + } else if (op == GpuSemaphoreOperation::AcquireGequal) { + regs.acquire_active = true; + regs.acquire_mode = true; + } else if (op == GpuSemaphoreOperation::AcquireMask) { + // TODO(kemathe) The acquire mask operation waits for a value that, ANDed with + // semaphore_sequence, gives a non-0 result + LOG_ERROR(HW_GPU, "Invalid semaphore operation AcquireMask not implemented"); + } else { + LOG_ERROR(HW_GPU, "Invalid semaphore operation"); + } + } + } + } + + void ProcessSemaphoreRelease() { + memory_manager->Write(regs.semaphore_address.SemaphoreAddress(), + regs.semaphore_release); + } + + void ProcessSemaphoreAcquire() { + const u32 word = memory_manager->Read(regs.semaphore_address.SemaphoreAddress()); + const auto value = regs.semaphore_acquire; + if (word != value) { + regs.acquire_active = true; + regs.acquire_value = value; + // TODO(kemathe73) figure out how to do the acquire_timeout + regs.acquire_mode = false; + regs.acquire_source = false; + } + } + + /// Calls a GPU puller method. + void CallPullerMethod(const GPU::MethodCall& method_call) { + regs.reg_array[method_call.method] = method_call.argument; + const auto method = static_cast(method_call.method); + + switch (method) { + case BufferMethods::BindObject: { + ProcessBindMethod(method_call); + break; + } + case BufferMethods::Nop: + case BufferMethods::SemaphoreAddressHigh: + case BufferMethods::SemaphoreAddressLow: + case BufferMethods::SemaphoreSequence: + case BufferMethods::UnkCacheFlush: + case BufferMethods::WrcacheFlush: + case BufferMethods::FenceValue: + break; + case BufferMethods::RefCnt: + rasterizer->SignalReference(); + break; + case BufferMethods::FenceAction: + ProcessFenceActionMethod(); + break; + case BufferMethods::WaitForInterrupt: + ProcessWaitForInterruptMethod(); + break; + case BufferMethods::SemaphoreTrigger: { + ProcessSemaphoreTriggerMethod(); + break; + } + case BufferMethods::NotifyIntr: { + // TODO(Kmather73): Research and implement this method. + LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented"); + break; + } + case BufferMethods::Unk28: { + // TODO(Kmather73): Research and implement this method. + LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented"); + break; + } + case BufferMethods::SemaphoreAcquire: { + ProcessSemaphoreAcquire(); + break; + } + case BufferMethods::SemaphoreRelease: { + ProcessSemaphoreRelease(); + break; + } + case BufferMethods::Yield: { + // TODO(Kmather73): Research and implement this method. + LOG_ERROR(HW_GPU, "Special puller engine method Yield not implemented"); + break; + } + default: + LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", method); + break; + } + } + + /// Calls a GPU engine method. + void CallEngineMethod(const GPU::MethodCall& method_call) { + const EngineID engine = bound_engines[method_call.subchannel]; + + switch (engine) { + case EngineID::FERMI_TWOD_A: + fermi_2d->CallMethod(method_call.method, method_call.argument, + method_call.IsLastCall()); + break; + case EngineID::MAXWELL_B: + maxwell_3d->CallMethod(method_call.method, method_call.argument, + method_call.IsLastCall()); + break; + case EngineID::KEPLER_COMPUTE_B: + kepler_compute->CallMethod(method_call.method, method_call.argument, + method_call.IsLastCall()); + break; + case EngineID::MAXWELL_DMA_COPY_A: + maxwell_dma->CallMethod(method_call.method, method_call.argument, + method_call.IsLastCall()); + break; + case EngineID::KEPLER_INLINE_TO_MEMORY_B: + kepler_memory->CallMethod(method_call.method, method_call.argument, + method_call.IsLastCall()); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented engine"); + } + } + + /// Calls a GPU engine multivalue method. + void CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, + u32 methods_pending) { + const EngineID engine = bound_engines[subchannel]; + + switch (engine) { + case EngineID::FERMI_TWOD_A: + fermi_2d->CallMultiMethod(method, base_start, amount, methods_pending); + break; + case EngineID::MAXWELL_B: + maxwell_3d->CallMultiMethod(method, base_start, amount, methods_pending); + break; + case EngineID::KEPLER_COMPUTE_B: + kepler_compute->CallMultiMethod(method, base_start, amount, methods_pending); + break; + case EngineID::MAXWELL_DMA_COPY_A: + maxwell_dma->CallMultiMethod(method, base_start, amount, methods_pending); + break; + case EngineID::KEPLER_INLINE_TO_MEMORY_B: + kepler_memory->CallMultiMethod(method, base_start, amount, methods_pending); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented engine"); + } + } + + /// Determines where the method should be executed. + [[nodiscard]] bool ExecuteMethodOnEngine(u32 method) { + const auto buffer_method = static_cast(method); + return buffer_method >= BufferMethods::NonPullerMethods; + } + + struct Regs { + static constexpr size_t NUM_REGS = 0x40; + + union { + struct { + INSERT_PADDING_WORDS_NOINIT(0x4); + struct { + u32 address_high; + u32 address_low; + + [[nodiscard]] GPUVAddr SemaphoreAddress() const { + return static_cast((static_cast(address_high) << 32) | + address_low); + } + } semaphore_address; + + u32 semaphore_sequence; + u32 semaphore_trigger; + INSERT_PADDING_WORDS_NOINIT(0xC); + + // The pusher and the puller share the reference counter, the pusher only has read + // access + u32 reference_count; + INSERT_PADDING_WORDS_NOINIT(0x5); + + u32 semaphore_acquire; + u32 semaphore_release; + u32 fence_value; + GPU::FenceAction fence_action; + INSERT_PADDING_WORDS_NOINIT(0xE2); + + // Puller state + u32 acquire_mode; + u32 acquire_source; + u32 acquire_active; + u32 acquire_timeout; + u32 acquire_value; + }; + std::array reg_array; + }; + } regs{}; + + GPU& gpu; + Core::System& system; + std::unique_ptr memory_manager; + std::unique_ptr dma_pusher; + std::unique_ptr cdma_pusher; + std::unique_ptr renderer; + VideoCore::RasterizerInterface* rasterizer = nullptr; + const bool use_nvdec; + + /// Mapping of command subchannels to their bound engine ids + std::array bound_engines{}; + /// 3D engine + std::unique_ptr maxwell_3d; + /// 2D engine + std::unique_ptr fermi_2d; + /// Compute engine + std::unique_ptr kepler_compute; + /// DMA engine + std::unique_ptr maxwell_dma; + /// Inline memory engine + std::unique_ptr kepler_memory; + /// Shader build notifier + std::unique_ptr shader_notify; + /// When true, we are about to shut down emulation session, so terminate outstanding tasks + std::atomic_bool shutting_down{}; + + std::array, Service::Nvidia::MaxSyncPoints> syncpoints{}; + + std::array, Service::Nvidia::MaxSyncPoints> syncpt_interrupts; + + std::mutex sync_mutex; + std::mutex device_mutex; + + std::condition_variable sync_cv; + + struct FlushRequest { + explicit FlushRequest(u64 fence_, VAddr addr_, std::size_t size_) + : fence{fence_}, addr{addr_}, size{size_} {} + u64 fence; + VAddr addr; + std::size_t size; + }; + + std::list flush_requests; + std::atomic current_flush_fence{}; + u64 last_flush_fence{}; + std::mutex flush_request_mutex; + + const bool is_async; + + VideoCommon::GPUThread::ThreadManager gpu_thread; + std::unique_ptr cpu_context; + +#define ASSERT_REG_POSITION(field_name, position) \ + static_assert(offsetof(Regs, field_name) == position * 4, \ + "Field " #field_name " has invalid position") + + ASSERT_REG_POSITION(semaphore_address, 0x4); + ASSERT_REG_POSITION(semaphore_sequence, 0x6); + ASSERT_REG_POSITION(semaphore_trigger, 0x7); + ASSERT_REG_POSITION(reference_count, 0x14); + ASSERT_REG_POSITION(semaphore_acquire, 0x1A); + ASSERT_REG_POSITION(semaphore_release, 0x1B); + ASSERT_REG_POSITION(fence_value, 0x1C); + ASSERT_REG_POSITION(fence_action, 0x1D); + + ASSERT_REG_POSITION(acquire_mode, 0x100); + ASSERT_REG_POSITION(acquire_source, 0x101); + ASSERT_REG_POSITION(acquire_active, 0x102); + ASSERT_REG_POSITION(acquire_timeout, 0x103); + ASSERT_REG_POSITION(acquire_value, 0x104); + +#undef ASSERT_REG_POSITION + + enum class GpuSemaphoreOperation { + AcquireEqual = 0x1, + WriteLong = 0x2, + AcquireGequal = 0x4, + AcquireMask = 0x8, + }; +}; + +GPU::GPU(Core::System& system, bool is_async, bool use_nvdec) + : impl{std::make_unique(*this, system, is_async, use_nvdec)} {} GPU::~GPU() = default; -void GPU::BindRenderer(std::unique_ptr renderer_) { - renderer = std::move(renderer_); - rasterizer = renderer->ReadRasterizer(); - - memory_manager->BindRasterizer(rasterizer); - maxwell_3d->BindRasterizer(rasterizer); - fermi_2d->BindRasterizer(rasterizer); - kepler_compute->BindRasterizer(rasterizer); - maxwell_dma->BindRasterizer(rasterizer); +void GPU::BindRenderer(std::unique_ptr renderer) { + impl->BindRenderer(std::move(renderer)); } -Engines::Maxwell3D& GPU::Maxwell3D() { - return *maxwell_3d; -} - -const Engines::Maxwell3D& GPU::Maxwell3D() const { - return *maxwell_3d; -} - -Engines::KeplerCompute& GPU::KeplerCompute() { - return *kepler_compute; -} - -const Engines::KeplerCompute& GPU::KeplerCompute() const { - return *kepler_compute; -} - -MemoryManager& GPU::MemoryManager() { - return *memory_manager; -} - -const MemoryManager& GPU::MemoryManager() const { - return *memory_manager; -} - -DmaPusher& GPU::DmaPusher() { - return *dma_pusher; -} - -Tegra::CDmaPusher& GPU::CDmaPusher() { - return *cdma_pusher; -} - -const DmaPusher& GPU::DmaPusher() const { - return *dma_pusher; -} - -const Tegra::CDmaPusher& GPU::CDmaPusher() const { - return *cdma_pusher; -} - -void GPU::WaitFence(u32 syncpoint_id, u32 value) { - // Synced GPU, is always in sync - if (!is_async) { - return; - } - if (syncpoint_id == UINT32_MAX) { - // TODO: Research what this does. - LOG_ERROR(HW_GPU, "Waiting for syncpoint -1 not implemented"); - return; - } - MICROPROFILE_SCOPE(GPU_wait); - std::unique_lock lock{sync_mutex}; - sync_cv.wait(lock, [=, this] { - if (shutting_down.load(std::memory_order_relaxed)) { - // We're shutting down, ensure no threads continue to wait for the next syncpoint - return true; - } - return syncpoints.at(syncpoint_id).load() >= value; - }); -} - -void GPU::IncrementSyncPoint(const u32 syncpoint_id) { - auto& syncpoint = syncpoints.at(syncpoint_id); - syncpoint++; - std::lock_guard lock{sync_mutex}; - sync_cv.notify_all(); - auto& interrupt = syncpt_interrupts.at(syncpoint_id); - if (!interrupt.empty()) { - u32 value = syncpoint.load(); - auto it = interrupt.begin(); - while (it != interrupt.end()) { - if (value >= *it) { - TriggerCpuInterrupt(syncpoint_id, *it); - it = interrupt.erase(it); - continue; - } - it++; - } - } -} - -u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const { - return syncpoints.at(syncpoint_id).load(); -} - -void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) { - auto& interrupt = syncpt_interrupts.at(syncpoint_id); - bool contains = std::any_of(interrupt.begin(), interrupt.end(), - [value](u32 in_value) { return in_value == value; }); - if (contains) { - return; - } - interrupt.emplace_back(value); -} - -bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) { - std::lock_guard lock{sync_mutex}; - auto& interrupt = syncpt_interrupts.at(syncpoint_id); - const auto iter = - std::find_if(interrupt.begin(), interrupt.end(), - [value](u32 interrupt_value) { return value == interrupt_value; }); - - if (iter == interrupt.end()) { - return false; - } - interrupt.erase(iter); - return true; -} - -u64 GPU::RequestFlush(VAddr addr, std::size_t size) { - std::unique_lock lck{flush_request_mutex}; - const u64 fence = ++last_flush_fence; - flush_requests.emplace_back(fence, addr, size); - return fence; -} - -void GPU::TickWork() { - std::unique_lock lck{flush_request_mutex}; - while (!flush_requests.empty()) { - auto& request = flush_requests.front(); - const u64 fence = request.fence; - const VAddr addr = request.addr; - const std::size_t size = request.size; - flush_requests.pop_front(); - flush_request_mutex.unlock(); - rasterizer->FlushRegion(addr, size); - current_flush_fence.store(fence); - flush_request_mutex.lock(); - } -} - -u64 GPU::GetTicks() const { - // This values were reversed engineered by fincs from NVN - // The gpu clock is reported in units of 385/625 nanoseconds - constexpr u64 gpu_ticks_num = 384; - constexpr u64 gpu_ticks_den = 625; - - u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count(); - if (Settings::values.use_fast_gpu_time.GetValue()) { - nanoseconds /= 256; - } - const u64 nanoseconds_num = nanoseconds / gpu_ticks_den; - const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den; - return nanoseconds_num * gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den; -} - -void GPU::RendererFrameEndNotify() { - system.GetPerfStats().EndGameFrame(); -} - -void GPU::FlushCommands() { - rasterizer->FlushCommands(); -} - -void GPU::SyncGuestHost() { - rasterizer->SyncGuestHost(); -} - -enum class GpuSemaphoreOperation { - AcquireEqual = 0x1, - WriteLong = 0x2, - AcquireGequal = 0x4, - AcquireMask = 0x8, -}; - void GPU::CallMethod(const MethodCall& method_call) { - LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method_call.method, - method_call.subchannel); - - ASSERT(method_call.subchannel < bound_engines.size()); - - if (ExecuteMethodOnEngine(method_call.method)) { - CallEngineMethod(method_call); - } else { - CallPullerMethod(method_call); - } + impl->CallMethod(method_call); } void GPU::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, u32 methods_pending) { - LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method, subchannel); - - ASSERT(subchannel < bound_engines.size()); - - if (ExecuteMethodOnEngine(method)) { - CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending); - } else { - for (std::size_t i = 0; i < amount; i++) { - CallPullerMethod(MethodCall{ - method, - base_start[i], - subchannel, - methods_pending - static_cast(i), - }); - } - } + impl->CallMultiMethod(method, subchannel, base_start, amount, methods_pending); } -bool GPU::ExecuteMethodOnEngine(u32 method) { - const auto buffer_method = static_cast(method); - return buffer_method >= BufferMethods::NonPullerMethods; +void GPU::FlushCommands() { + impl->FlushCommands(); } -void GPU::CallPullerMethod(const MethodCall& method_call) { - regs.reg_array[method_call.method] = method_call.argument; - const auto method = static_cast(method_call.method); - - switch (method) { - case BufferMethods::BindObject: { - ProcessBindMethod(method_call); - break; - } - case BufferMethods::Nop: - case BufferMethods::SemaphoreAddressHigh: - case BufferMethods::SemaphoreAddressLow: - case BufferMethods::SemaphoreSequence: - case BufferMethods::UnkCacheFlush: - case BufferMethods::WrcacheFlush: - case BufferMethods::FenceValue: - break; - case BufferMethods::RefCnt: - rasterizer->SignalReference(); - break; - case BufferMethods::FenceAction: - ProcessFenceActionMethod(); - break; - case BufferMethods::WaitForInterrupt: - ProcessWaitForInterruptMethod(); - break; - case BufferMethods::SemaphoreTrigger: { - ProcessSemaphoreTriggerMethod(); - break; - } - case BufferMethods::NotifyIntr: { - // TODO(Kmather73): Research and implement this method. - LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented"); - break; - } - case BufferMethods::Unk28: { - // TODO(Kmather73): Research and implement this method. - LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented"); - break; - } - case BufferMethods::SemaphoreAcquire: { - ProcessSemaphoreAcquire(); - break; - } - case BufferMethods::SemaphoreRelease: { - ProcessSemaphoreRelease(); - break; - } - case BufferMethods::Yield: { - // TODO(Kmather73): Research and implement this method. - LOG_ERROR(HW_GPU, "Special puller engine method Yield not implemented"); - break; - } - default: - LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", method); - break; - } -} - -void GPU::CallEngineMethod(const MethodCall& method_call) { - const EngineID engine = bound_engines[method_call.subchannel]; - - switch (engine) { - case EngineID::FERMI_TWOD_A: - fermi_2d->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall()); - break; - case EngineID::MAXWELL_B: - maxwell_3d->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall()); - break; - case EngineID::KEPLER_COMPUTE_B: - kepler_compute->CallMethod(method_call.method, method_call.argument, - method_call.IsLastCall()); - break; - case EngineID::MAXWELL_DMA_COPY_A: - maxwell_dma->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall()); - break; - case EngineID::KEPLER_INLINE_TO_MEMORY_B: - kepler_memory->CallMethod(method_call.method, method_call.argument, - method_call.IsLastCall()); - break; - default: - UNIMPLEMENTED_MSG("Unimplemented engine"); - } -} - -void GPU::CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, - u32 methods_pending) { - const EngineID engine = bound_engines[subchannel]; - - switch (engine) { - case EngineID::FERMI_TWOD_A: - fermi_2d->CallMultiMethod(method, base_start, amount, methods_pending); - break; - case EngineID::MAXWELL_B: - maxwell_3d->CallMultiMethod(method, base_start, amount, methods_pending); - break; - case EngineID::KEPLER_COMPUTE_B: - kepler_compute->CallMultiMethod(method, base_start, amount, methods_pending); - break; - case EngineID::MAXWELL_DMA_COPY_A: - maxwell_dma->CallMultiMethod(method, base_start, amount, methods_pending); - break; - case EngineID::KEPLER_INLINE_TO_MEMORY_B: - kepler_memory->CallMultiMethod(method, base_start, amount, methods_pending); - break; - default: - UNIMPLEMENTED_MSG("Unimplemented engine"); - } -} - -void GPU::ProcessBindMethod(const MethodCall& method_call) { - // Bind the current subchannel to the desired engine id. - LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", method_call.subchannel, - method_call.argument); - const auto engine_id = static_cast(method_call.argument); - bound_engines[method_call.subchannel] = static_cast(engine_id); - switch (engine_id) { - case EngineID::FERMI_TWOD_A: - dma_pusher->BindSubchannel(fermi_2d.get(), method_call.subchannel); - break; - case EngineID::MAXWELL_B: - dma_pusher->BindSubchannel(maxwell_3d.get(), method_call.subchannel); - break; - case EngineID::KEPLER_COMPUTE_B: - dma_pusher->BindSubchannel(kepler_compute.get(), method_call.subchannel); - break; - case EngineID::MAXWELL_DMA_COPY_A: - dma_pusher->BindSubchannel(maxwell_dma.get(), method_call.subchannel); - break; - case EngineID::KEPLER_INLINE_TO_MEMORY_B: - dma_pusher->BindSubchannel(kepler_memory.get(), method_call.subchannel); - break; - default: - UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id); - } -} - -void GPU::ProcessFenceActionMethod() { - switch (regs.fence_action.op) { - case FenceOperation::Acquire: - WaitFence(regs.fence_action.syncpoint_id, regs.fence_value); - break; - case FenceOperation::Increment: - IncrementSyncPoint(regs.fence_action.syncpoint_id); - break; - default: - UNIMPLEMENTED_MSG("Unimplemented operation {}", regs.fence_action.op.Value()); - } -} - -void GPU::ProcessWaitForInterruptMethod() { - // TODO(bunnei) ImplementMe - LOG_WARNING(HW_GPU, "(STUBBED) called"); -} - -void GPU::ProcessSemaphoreTriggerMethod() { - const auto semaphoreOperationMask = 0xF; - const auto op = - static_cast(regs.semaphore_trigger & semaphoreOperationMask); - if (op == GpuSemaphoreOperation::WriteLong) { - struct Block { - u32 sequence; - u32 zeros = 0; - u64 timestamp; - }; - - Block block{}; - block.sequence = regs.semaphore_sequence; - // TODO(Kmather73): Generate a real GPU timestamp and write it here instead of - // CoreTiming - block.timestamp = GetTicks(); - memory_manager->WriteBlock(regs.semaphore_address.SemaphoreAddress(), &block, - sizeof(block)); - } else { - const u32 word{memory_manager->Read(regs.semaphore_address.SemaphoreAddress())}; - if ((op == GpuSemaphoreOperation::AcquireEqual && word == regs.semaphore_sequence) || - (op == GpuSemaphoreOperation::AcquireGequal && - static_cast(word - regs.semaphore_sequence) > 0) || - (op == GpuSemaphoreOperation::AcquireMask && (word & regs.semaphore_sequence))) { - // Nothing to do in this case - } else { - regs.acquire_source = true; - regs.acquire_value = regs.semaphore_sequence; - if (op == GpuSemaphoreOperation::AcquireEqual) { - regs.acquire_active = true; - regs.acquire_mode = false; - } else if (op == GpuSemaphoreOperation::AcquireGequal) { - regs.acquire_active = true; - regs.acquire_mode = true; - } else if (op == GpuSemaphoreOperation::AcquireMask) { - // TODO(kemathe) The acquire mask operation waits for a value that, ANDed with - // semaphore_sequence, gives a non-0 result - LOG_ERROR(HW_GPU, "Invalid semaphore operation AcquireMask not implemented"); - } else { - LOG_ERROR(HW_GPU, "Invalid semaphore operation"); - } - } - } -} - -void GPU::ProcessSemaphoreRelease() { - memory_manager->Write(regs.semaphore_address.SemaphoreAddress(), regs.semaphore_release); -} - -void GPU::ProcessSemaphoreAcquire() { - const u32 word = memory_manager->Read(regs.semaphore_address.SemaphoreAddress()); - const auto value = regs.semaphore_acquire; - if (word != value) { - regs.acquire_active = true; - regs.acquire_value = value; - // TODO(kemathe73) figure out how to do the acquire_timeout - regs.acquire_mode = false; - regs.acquire_source = false; - } -} - -void GPU::Start() { - gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher); - cpu_context = renderer->GetRenderWindow().CreateSharedContext(); - cpu_context->MakeCurrent(); -} - -void GPU::ObtainContext() { - cpu_context->MakeCurrent(); -} - -void GPU::ReleaseContext() { - cpu_context->DoneCurrent(); -} - -void GPU::PushGPUEntries(Tegra::CommandList&& entries) { - gpu_thread.SubmitList(std::move(entries)); -} - -void GPU::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) { - if (!use_nvdec) { - return; - } - - if (!cdma_pusher) { - cdma_pusher = std::make_unique(*this); - } - - // SubmitCommandBuffer would make the nvdec operations async, this is not currently working - // TODO(ameerj): RE proper async nvdec operation - // gpu_thread.SubmitCommandBuffer(std::move(entries)); - - cdma_pusher->ProcessEntries(std::move(entries)); -} - -void GPU::ClearCdmaInstance() { - cdma_pusher.reset(); -} - -void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { - gpu_thread.SwapBuffers(framebuffer); -} - -void GPU::FlushRegion(VAddr addr, u64 size) { - gpu_thread.FlushRegion(addr, size); -} - -void GPU::InvalidateRegion(VAddr addr, u64 size) { - gpu_thread.InvalidateRegion(addr, size); -} - -void GPU::FlushAndInvalidateRegion(VAddr addr, u64 size) { - gpu_thread.FlushAndInvalidateRegion(addr, size); -} - -void GPU::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const { - auto& interrupt_manager = system.InterruptManager(); - interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value); +void GPU::SyncGuestHost() { + impl->SyncGuestHost(); } void GPU::OnCommandListEnd() { - if (is_async) { - // This command only applies to asynchronous GPU mode - gpu_thread.OnCommandListEnd(); - } + impl->OnCommandListEnd(); +} + +u64 GPU::RequestFlush(VAddr addr, std::size_t size) { + return impl->RequestFlush(addr, size); +} + +u64 GPU::CurrentFlushRequestFence() const { + return impl->CurrentFlushRequestFence(); +} + +void GPU::TickWork() { + impl->TickWork(); +} + +Engines::Maxwell3D& GPU::Maxwell3D() { + return impl->Maxwell3D(); +} + +const Engines::Maxwell3D& GPU::Maxwell3D() const { + return impl->Maxwell3D(); +} + +Engines::KeplerCompute& GPU::KeplerCompute() { + return impl->KeplerCompute(); +} + +const Engines::KeplerCompute& GPU::KeplerCompute() const { + return impl->KeplerCompute(); +} + +Tegra::MemoryManager& GPU::MemoryManager() { + return impl->MemoryManager(); +} + +const Tegra::MemoryManager& GPU::MemoryManager() const { + return impl->MemoryManager(); +} + +Tegra::DmaPusher& GPU::DmaPusher() { + return impl->DmaPusher(); +} + +const Tegra::DmaPusher& GPU::DmaPusher() const { + return impl->DmaPusher(); +} + +Tegra::CDmaPusher& GPU::CDmaPusher() { + return impl->CDmaPusher(); +} + +const Tegra::CDmaPusher& GPU::CDmaPusher() const { + return impl->CDmaPusher(); +} + +VideoCore::RendererBase& GPU::Renderer() { + return impl->Renderer(); +} + +const VideoCore::RendererBase& GPU::Renderer() const { + return impl->Renderer(); +} + +VideoCore::ShaderNotify& GPU::ShaderNotify() { + return impl->ShaderNotify(); +} + +const VideoCore::ShaderNotify& GPU::ShaderNotify() const { + return impl->ShaderNotify(); +} + +void GPU::WaitFence(u32 syncpoint_id, u32 value) { + impl->WaitFence(syncpoint_id, value); +} + +void GPU::IncrementSyncPoint(u32 syncpoint_id) { + impl->IncrementSyncPoint(syncpoint_id); +} + +u32 GPU::GetSyncpointValue(u32 syncpoint_id) const { + return impl->GetSyncpointValue(syncpoint_id); +} + +void GPU::RegisterSyncptInterrupt(u32 syncpoint_id, u32 value) { + impl->RegisterSyncptInterrupt(syncpoint_id, value); +} + +bool GPU::CancelSyncptInterrupt(u32 syncpoint_id, u32 value) { + return impl->CancelSyncptInterrupt(syncpoint_id, value); +} + +u64 GPU::GetTicks() const { + return impl->GetTicks(); +} + +bool GPU::IsAsync() const { + return impl->IsAsync(); +} + +bool GPU::UseNvdec() const { + return impl->UseNvdec(); +} + +void GPU::RendererFrameEndNotify() { + impl->RendererFrameEndNotify(); +} + +void GPU::Start() { + impl->Start(); +} + +void GPU::ObtainContext() { + impl->ObtainContext(); +} + +void GPU::ReleaseContext() { + impl->ReleaseContext(); +} + +void GPU::PushGPUEntries(Tegra::CommandList&& entries) { + impl->PushGPUEntries(std::move(entries)); +} + +void GPU::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) { + impl->PushCommandBuffer(entries); +} + +void GPU::ClearCdmaInstance() { + impl->ClearCdmaInstance(); +} + +void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { + impl->SwapBuffers(framebuffer); +} + +void GPU::FlushRegion(VAddr addr, u64 size) { + impl->FlushRegion(addr, size); +} + +void GPU::InvalidateRegion(VAddr addr, u64 size) { + impl->InvalidateRegion(addr, size); +} + +void GPU::FlushAndInvalidateRegion(VAddr addr, u64 size) { + impl->FlushAndInvalidateRegion(addr, size); } } // namespace Tegra diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index e6a02a71b..a2bbfd99e 100755 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -3,29 +3,12 @@ // Refer to the license.txt file included. #pragma once - -#include -#include -#include -#include #include -#include + +#include "common/bit_field.h" #include "common/common_types.h" -#include "core/hle/service/nvdrv/nvdata.h" -#include "core/hle/service/nvflinger/buffer_queue.h" #include "video_core/cdma_pusher.h" -#include "video_core/dma_pusher.h" #include "video_core/framebuffer_config.h" -#include "video_core/gpu_thread.h" - -using CacheAddr = std::uintptr_t; -[[nodiscard]] inline CacheAddr ToCacheAddr(const void* host_ptr) { - return reinterpret_cast(host_ptr); -} - -[[nodiscard]] inline u8* FromCacheAddr(CacheAddr cache_addr) { - return reinterpret_cast(cache_addr); -} namespace Core { namespace Frontend { @@ -40,6 +23,9 @@ class ShaderNotify; } // namespace VideoCore namespace Tegra { +class DmaPusher; +class CDmaPusher; +struct CommandList; enum class RenderTargetFormat : u32 { NONE = 0x0, @@ -138,7 +124,18 @@ public: } }; - explicit GPU(Core::System& system_, bool is_async_, bool use_nvdec_); + enum class FenceOperation : u32 { + Acquire = 0, + Increment = 1, + }; + + union FenceAction { + u32 raw; + BitField<0, 1, FenceOperation> op; + BitField<8, 24, u32> syncpoint_id; + }; + + explicit GPU(Core::System& system, bool is_async, bool use_nvdec); ~GPU(); /// Binds a renderer to the GPU. @@ -162,9 +159,7 @@ public: [[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size); /// Obtains current flush request fence id. - [[nodiscard]] u64 CurrentFlushRequestFence() const { - return current_flush_fence.load(std::memory_order_relaxed); - } + [[nodiscard]] u64 CurrentFlushRequestFence() const; /// Tick pending requests within the GPU. void TickWork(); @@ -200,24 +195,16 @@ public: [[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const; /// Returns a reference to the underlying renderer. - [[nodiscard]] VideoCore::RendererBase& Renderer() { - return *renderer; - } + [[nodiscard]] VideoCore::RendererBase& Renderer(); /// Returns a const reference to the underlying renderer. - [[nodiscard]] const VideoCore::RendererBase& Renderer() const { - return *renderer; - } + [[nodiscard]] const VideoCore::RendererBase& Renderer() const; /// Returns a reference to the shader notifier. - [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify() { - return *shader_notify; - } + [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify(); /// Returns a const reference to the shader notifier. - [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const { - return *shader_notify; - } + [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const; /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame. void WaitFence(u32 syncpoint_id, u32 value); @@ -232,80 +219,12 @@ public: [[nodiscard]] u64 GetTicks() const; - [[nodiscard]] std::unique_lock LockSync() { - return std::unique_lock{sync_mutex}; - } + [[nodiscard]] bool IsAsync() const; - [[nodiscard]] bool IsAsync() const { - return is_async; - } - - [[nodiscard]] bool UseNvdec() const { - return use_nvdec; - } + [[nodiscard]] bool UseNvdec() const; void RendererFrameEndNotify(); - enum class FenceOperation : u32 { - Acquire = 0, - Increment = 1, - }; - - union FenceAction { - u32 raw; - BitField<0, 1, FenceOperation> op; - BitField<8, 24, u32> syncpoint_id; - - [[nodiscard]] static CommandHeader Build(FenceOperation op, u32 syncpoint_id) { - FenceAction result{}; - result.op.Assign(op); - result.syncpoint_id.Assign(syncpoint_id); - return {result.raw}; - } - }; - - struct Regs { - static constexpr size_t NUM_REGS = 0x40; - - union { - struct { - INSERT_PADDING_WORDS_NOINIT(0x4); - struct { - u32 address_high; - u32 address_low; - - [[nodiscard]] GPUVAddr SemaphoreAddress() const { - return static_cast((static_cast(address_high) << 32) | - address_low); - } - } semaphore_address; - - u32 semaphore_sequence; - u32 semaphore_trigger; - INSERT_PADDING_WORDS_NOINIT(0xC); - - // The pusher and the puller share the reference counter, the pusher only has read - // access - u32 reference_count; - INSERT_PADDING_WORDS_NOINIT(0x5); - - u32 semaphore_acquire; - u32 semaphore_release; - u32 fence_value; - FenceAction fence_action; - INSERT_PADDING_WORDS_NOINIT(0xE2); - - // Puller state - u32 acquire_mode; - u32 acquire_source; - u32 acquire_active; - u32 acquire_timeout; - u32 acquire_value; - }; - std::array reg_array; - }; - } regs{}; - /// Performs any additional setup necessary in order to begin GPU emulation. /// This can be used to launch any necessary threads and register any necessary /// core timing events. @@ -338,104 +257,9 @@ public: /// Notify rasterizer that any caches of the specified region should be flushed and invalidated void FlushAndInvalidateRegion(VAddr addr, u64 size); -protected: - void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const; - private: - void ProcessBindMethod(const MethodCall& method_call); - void ProcessFenceActionMethod(); - void ProcessWaitForInterruptMethod(); - void ProcessSemaphoreTriggerMethod(); - void ProcessSemaphoreRelease(); - void ProcessSemaphoreAcquire(); - - /// Calls a GPU puller method. - void CallPullerMethod(const MethodCall& method_call); - - /// Calls a GPU engine method. - void CallEngineMethod(const MethodCall& method_call); - - /// Calls a GPU engine multivalue method. - void CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, - u32 methods_pending); - - /// Determines where the method should be executed. - [[nodiscard]] bool ExecuteMethodOnEngine(u32 method); - -protected: - Core::System& system; - std::unique_ptr memory_manager; - std::unique_ptr dma_pusher; - std::unique_ptr cdma_pusher; - std::unique_ptr renderer; - VideoCore::RasterizerInterface* rasterizer = nullptr; - const bool use_nvdec; - -private: - /// Mapping of command subchannels to their bound engine ids - std::array bound_engines = {}; - /// 3D engine - std::unique_ptr maxwell_3d; - /// 2D engine - std::unique_ptr fermi_2d; - /// Compute engine - std::unique_ptr kepler_compute; - /// DMA engine - std::unique_ptr maxwell_dma; - /// Inline memory engine - std::unique_ptr kepler_memory; - /// Shader build notifier - std::unique_ptr shader_notify; - /// When true, we are about to shut down emulation session, so terminate outstanding tasks - std::atomic_bool shutting_down{}; - - std::array, Service::Nvidia::MaxSyncPoints> syncpoints{}; - - std::array, Service::Nvidia::MaxSyncPoints> syncpt_interrupts; - - std::mutex sync_mutex; - std::mutex device_mutex; - - std::condition_variable sync_cv; - - struct FlushRequest { - explicit FlushRequest(u64 fence_, VAddr addr_, std::size_t size_) - : fence{fence_}, addr{addr_}, size{size_} {} - u64 fence; - VAddr addr; - std::size_t size; - }; - - std::list flush_requests; - std::atomic current_flush_fence{}; - u64 last_flush_fence{}; - std::mutex flush_request_mutex; - - const bool is_async; - - VideoCommon::GPUThread::ThreadManager gpu_thread; - std::unique_ptr cpu_context; + struct Impl; + std::unique_ptr impl; }; -#define ASSERT_REG_POSITION(field_name, position) \ - static_assert(offsetof(GPU::Regs, field_name) == position * 4, \ - "Field " #field_name " has invalid position") - -ASSERT_REG_POSITION(semaphore_address, 0x4); -ASSERT_REG_POSITION(semaphore_sequence, 0x6); -ASSERT_REG_POSITION(semaphore_trigger, 0x7); -ASSERT_REG_POSITION(reference_count, 0x14); -ASSERT_REG_POSITION(semaphore_acquire, 0x1A); -ASSERT_REG_POSITION(semaphore_release, 0x1B); -ASSERT_REG_POSITION(fence_value, 0x1C); -ASSERT_REG_POSITION(fence_action, 0x1D); - -ASSERT_REG_POSITION(acquire_mode, 0x100); -ASSERT_REG_POSITION(acquire_source, 0x101); -ASSERT_REG_POSITION(acquire_active, 0x102); -ASSERT_REG_POSITION(acquire_timeout, 0x103); -ASSERT_REG_POSITION(acquire_value, 0x104); - -#undef ASSERT_REG_POSITION - } // namespace Tegra diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index 91bada925..00984188e 100755 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -130,9 +130,6 @@ public: /// Notify rasterizer that any caches of the specified region should be flushed and invalidated void FlushAndInvalidateRegion(VAddr addr, u64 size); - // Stops the GPU execution and waits for the GPU to finish working - void ShutDown(); - void OnCommandListEnd(); private: diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h index a19b9f931..392f82eb7 100755 --- a/src/video_core/query_cache.h +++ b/src/video_core/query_cache.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/src/video_core/shader_environment.cpp b/src/video_core/shader_environment.cpp index 81a878bb2..05850afd0 100755 --- a/src/video_core/shader_environment.cpp +++ b/src/video_core/shader_environment.cpp @@ -16,6 +16,7 @@ #include "common/fs/fs.h" #include "common/logging/log.h" #include "shader_recompiler/environment.h" +#include "video_core/engines/kepler_compute.h" #include "video_core/memory_manager.h" #include "video_core/shader_environment.h" #include "video_core/textures/texture.h" diff --git a/src/video_core/shader_environment.h b/src/video_core/shader_environment.h index 2079979db..6640e53d0 100755 --- a/src/video_core/shader_environment.h +++ b/src/video_core/shader_environment.h @@ -5,13 +5,13 @@ #pragma once #include -#include #include #include #include #include #include #include +#include #include #include #include @@ -19,9 +19,7 @@ #include "common/common_types.h" #include "common/unique_function.h" #include "shader_recompiler/environment.h" -#include "video_core/engines/kepler_compute.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/textures/texture.h" namespace Tegra { class Memorymanager; diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index d7528ed24..423402128 100755 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -4,13 +4,13 @@ #pragma once -#include #include #include #include #include #include #include +#include #include "common/common_types.h" #include "common/literals.h"