WIP: Enhance shader compilation performance and control

This commit adds new settings and optimizations for shader compilation:

- Add new settings:
  - use_enhanced_shader_building: Enable enhanced shader compilation
  - shader_compilation_priority: Control shader compilation priority
- Improve shader compilation performance:
  - Optimize worker thread allocation based on CPU cores
  - Add smarter async shader compilation heuristics
  - Prioritize vertex and fragment shader compilation
  - Add performance tracking and logging
- Add performance monitoring:
  - Track shader compilation times
  - Log slow shader compilations
  - Monitor async shader compilation statistics

This is a work in progress commit. Further optimizations and refinements will be needed based on testing and feedback.

Signed-off-by: Zephyron <zephyron@citron-emu.org>
2025-10-19 21:47:54 +00:00 · 2025-03-27 20:56:23 +10:00 · 2025-03-27 20:56:23 +10:00 · c57a5fef92
commit c57a5fef92
parent bc86307ad6
8 changed files with 249 additions and 33 deletions
--- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp
+++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp
@ -1,10 +1,13 @@
-// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
+// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
+// SPDX-FileCopyrightText: Copyright 2025 Citron Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later

 #include <algorithm>
 #include <array>
 #include <string>
 #include <vector>
+#include <chrono>
+#include <functional>

 #include "common/settings.h" // for enum class Settings::ShaderBackend
 #include "common/thread_worker.h"
@ -234,26 +237,68 @@ GraphicsPipeline::GraphicsPipeline(const Device& device, TextureCache& texture_c
    auto func{[this, sources_ = std::move(sources), sources_spirv_ = std::move(sources_spirv),
               shader_notify, backend, in_parallel,
               force_context_flush](ShaderContext::Context*) mutable {
+        // Track time for shader compilation for possible performance tuning
+        const auto start_time = std::chrono::high_resolution_clock::now();
+
+        // Prepare compilation steps for all shader stages
+        std::vector<std::function<void()>> compilation_steps;
+        compilation_steps.reserve(5); // Maximum number of shader stages
+
+        // Prepare all compilation steps first to better distribute work
        for (size_t stage = 0; stage < 5; ++stage) {
            switch (backend) {
            case Settings::ShaderBackend::Glsl:
                if (!sources_[stage].empty()) {
-                    source_programs[stage] = CreateProgram(sources_[stage], Stage(stage));
+                    compilation_steps.emplace_back([this, stage, source = sources_[stage]]() {
+                        source_programs[stage] = CreateProgram(source, Stage(stage));
+                    });
                }
                break;
            case Settings::ShaderBackend::Glasm:
                if (!sources_[stage].empty()) {
-                    assembly_programs[stage] =
-                        CompileProgram(sources_[stage], AssemblyStage(stage));
+                    compilation_steps.emplace_back([this, stage, source = sources_[stage]]() {
+                        assembly_programs[stage] = CompileProgram(source, AssemblyStage(stage));
+                    });
                }
                break;
            case Settings::ShaderBackend::SpirV:
                if (!sources_spirv_[stage].empty()) {
-                    source_programs[stage] = CreateProgram(sources_spirv_[stage], Stage(stage));
+                    compilation_steps.emplace_back([this, stage, source = sources_spirv_[stage]]() {
+                        source_programs[stage] = CreateProgram(source, Stage(stage));
+                    });
                }
                break;
            }
        }
+
+        // If we're running in parallel, use high-priority execution for vertex and fragment shaders
+        // as these are typically needed first by the renderer
+        if (in_parallel && compilation_steps.size() > 1) {
+            // Execute vertex (0) and fragment (4) shaders first if they exist
+            for (size_t priority_stage : {0, 4}) {
+                for (size_t i = 0; i < compilation_steps.size(); ++i) {
+                    if ((i == priority_stage || (priority_stage == 0 && i <= 1)) && i < compilation_steps.size()) {
+                        compilation_steps[i]();
+                        compilation_steps[i] = [](){}; // Mark as executed
+                    }
+                }
+            }
+        }
+
+        // Execute all remaining compilation steps
+        for (auto& step : compilation_steps) {
+            step(); // Will do nothing for already executed steps
+        }
+
+        // Performance measurement for possible logging or optimization
+        const auto end_time = std::chrono::high_resolution_clock::now();
+        const auto compilation_time = std::chrono::duration_cast<std::chrono::milliseconds>(
+            end_time - start_time).count();
+
+        if (compilation_time > 50) { // Only log slow compilations
+            LOG_DEBUG(Render_OpenGL, "Shader compilation took {}ms", compilation_time);
+        }
+
        if (force_context_flush || in_parallel) {
            std::scoped_lock lock{built_mutex};
            built_fence.Create();
@ -623,15 +668,41 @@ void GraphicsPipeline::WaitForBuild() {
    is_built = true;
 }

-bool GraphicsPipeline::IsBuilt() noexcept {
+bool GraphicsPipeline::IsBuilt() const noexcept {
    if (is_built) {
        return true;
    }
-    if (built_fence.handle == 0) {
+    if (!built_fence.handle) {
        return false;
    }
-    is_built = built_fence.IsSignaled();
-    return is_built;
+    // No cached result yet: poll the build fence without blocking. A zero timeout makes
+    // glClientWaitSync return immediately with the current status of the fence.
+    const GLsync sync = built_fence.handle;
+    const GLenum result = glClientWaitSync(sync, 0, 0);
+    if (result == GL_ALREADY_SIGNALED || result == GL_CONDITION_SATISFIED) {
+        // is_built only caches the fence status, so it is updated through a const_cast to
+        // keep this query const. TODO: declare is_built `mutable` in the header instead.
+        const_cast<GraphicsPipeline*>(this)->is_built = true;
+        return true;
+    }
+
+    // Not signaled yet (or the wait failed): keep per-thread statistics for diagnostics
+    static thread_local std::chrono::steady_clock::time_point last_shader_wait_log;
+    static thread_local u32 shader_wait_count = 0;
+    ++shader_wait_count; // Count every unsuccessful poll, not only the ones that get logged
+    const auto now = std::chrono::steady_clock::now();
+    const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
+        now - last_shader_wait_log).count();
+
+    // Throttle the message so that frequent polling does not flood the log
+    if (elapsed >= 5) { // Log every 5 seconds
+        // steady_clock is monotonic, so system clock adjustments cannot distort the interval
+        LOG_DEBUG(Render_OpenGL, "Waiting for async shader compilation... (count={})",
+                 shader_wait_count);
+        last_shader_wait_log = now;
+    }
+
+    return false;
 }

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_graphics_pipeline.h
+++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.h
@ -1,4 +1,5 @@
 // SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
+// SPDX-FileCopyrightText: Copyright 2025 Citron Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later

 #pragma once
@ -102,7 +103,7 @@ public:
        return uses_local_memory;
    }

-    [[nodiscard]] bool IsBuilt() noexcept;
+    [[nodiscard]] bool IsBuilt() const noexcept;

    template <typename Spec>
    static auto MakeConfigureSpecFunc() {
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@ -1,4 +1,5 @@
 // SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
+// SPDX-FileCopyrightText: Copyright 2025 Citron Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later

 #include <atomic>
@ -608,9 +609,33 @@ std::unique_ptr<ComputePipeline> ShaderCache::CreateComputePipeline(
 }

 std::unique_ptr<ShaderWorker> ShaderCache::CreateWorkers() const {
-    return std::make_unique<ShaderWorker>(std::max(std::thread::hardware_concurrency(), 2U) - 1,
-                                          "GlShaderBuilder",
-                                          [this] { return Context{emu_window}; });
+    // Size the shader worker pool from the number of hardware threads. The count is clamped
+    // to at least 2 so that at least one worker is always created, and some hardware threads
+    // are left free for the main thread and other operations.
+    const u32 hardware_threads = std::max(std::thread::hardware_concurrency(), 2U);
+    const u32 optimal_workers = hardware_threads < 6 ?
+        hardware_threads - 1 : // Below 6 hardware threads (dual/quad core), leave 1 free
+        hardware_threads - 2;  // On 6+ core systems, leave 2 cores free for other tasks
+
+    auto worker = std::make_unique<ShaderWorker>(
+        optimal_workers,
+        "GlShaderBuilder",
+        [this] {
+            auto context = Context{emu_window};
+
+            // Apply thread priority based on settings: a positive value requests high
+            // priority, a negative value low priority, and 0 leaves the priority untouched.
+            const int priority = Settings::values.shader_compilation_priority.GetValue();
+            if (priority != 0) {
+                Common::SetCurrentThreadPriority(
+                    priority > 0 ? Common::ThreadPriority::High : Common::ThreadPriority::Low);
+            }
+
+            return context;
+        }
+    );
+
+    return worker;
 }

 } // namespace OpenGL