From 95adf299e4a9706c657627735f6e40fb3dafd729 Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Tue, 21 Mar 2023 21:13:03 -0400 Subject: [PATCH 1/5] x64: cpu_detect: Add detection of waitpkg instructions waitpkg introduces 3 instructions, UMONITOR, UMWAIT and TPAUSE. --- src/common/x64/cpu_detect.cpp | 1 + src/common/x64/cpu_detect.h | 1 + 2 files changed, 2 insertions(+) diff --git a/src/common/x64/cpu_detect.cpp b/src/common/x64/cpu_detect.cpp index e54383a4a8..72ed6e96c9 100644 --- a/src/common/x64/cpu_detect.cpp +++ b/src/common/x64/cpu_detect.cpp @@ -144,6 +144,7 @@ static CPUCaps Detect() { caps.bmi2 = Common::Bit<8>(cpu_id[1]); caps.sha = Common::Bit<29>(cpu_id[1]); + caps.waitpkg = Common::Bit<5>(cpu_id[2]); caps.gfni = Common::Bit<8>(cpu_id[2]); __cpuidex(cpu_id, 0x00000007, 0x00000001); diff --git a/src/common/x64/cpu_detect.h b/src/common/x64/cpu_detect.h index ca8db19d63..8253944d6b 100644 --- a/src/common/x64/cpu_detect.h +++ b/src/common/x64/cpu_detect.h @@ -67,6 +67,7 @@ struct CPUCaps { bool pclmulqdq : 1; bool popcnt : 1; bool sha : 1; + bool waitpkg : 1; }; /** From d260571440e16ce9b4a60aa6c93bf79dc95169a2 Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Tue, 21 Mar 2023 21:28:38 -0400 Subject: [PATCH 2/5] x64: Add MicroSleep MicroSleep allows the processor to pause for a "short" amount of time (in the microsecond range). This is useful for spin-waiting that does not require nanosecond precision. This uses the new TPAUSE instruction introduced on Intel's newest processors as part of the waitpkg instructions. For CPUs that do not support waitpkg instructions, this is equivalent to yield(). Co-Authored-By: liamwhite --- src/common/CMakeLists.txt | 2 ++ src/common/x64/cpu_wait.cpp | 72 +++++++++++++++++++++++++++++++++++++ src/common/x64/cpu_wait.h | 10 ++++++ 3 files changed, 84 insertions(+) create mode 100644 src/common/x64/cpu_wait.cpp create mode 100644 src/common/x64/cpu_wait.h diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index c1d2b24a14..13ed68b3f0 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -160,6 +160,8 @@ if(ARCHITECTURE_x86_64) PRIVATE x64/cpu_detect.cpp x64/cpu_detect.h + x64/cpu_wait.cpp + x64/cpu_wait.h x64/native_clock.cpp x64/native_clock.h x64/xbyak_abi.h diff --git a/src/common/x64/cpu_wait.cpp b/src/common/x64/cpu_wait.cpp new file mode 100644 index 0000000000..1fab0bfe8a --- /dev/null +++ b/src/common/x64/cpu_wait.cpp @@ -0,0 +1,72 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include + +#ifdef _MSC_VER +#include +#endif + +#include "common/x64/cpu_detect.h" +#include "common/x64/cpu_wait.h" + +namespace Common::X64 { + +#ifdef _MSC_VER +__forceinline static u64 FencedRDTSC() { + _mm_lfence(); + _ReadWriteBarrier(); + const u64 result = __rdtsc(); + _mm_lfence(); + _ReadWriteBarrier(); + return result; +} + +__forceinline static void TPAUSE() { + // 100,000 cycles is a reasonable amount of time to wait to save on CPU resources. + // For reference: + // At 1 GHz, 100K cycles is 100us + // At 2 GHz, 100K cycles is 50us + // At 4 GHz, 100K cycles is 25us + static constexpr auto PauseCycles = 100'000; + _tpause(0, FencedRDTSC() + PauseCycles); +} +#else +static u64 FencedRDTSC() { + u64 result; + asm volatile("lfence\n\t" + "rdtsc\n\t" + "shl $32, %%rdx\n\t" + "or %%rdx, %0\n\t" + "lfence" + : "=a"(result) + : + : "rdx", "memory", "cc"); + return result; +} + +static void TPAUSE() { + // 100,000 cycles is a reasonable amount of time to wait to save on CPU resources. + // For reference: + // At 1 GHz, 100K cycles is 100us + // At 2 GHz, 100K cycles is 50us + // At 4 GHz, 100K cycles is 25us + static constexpr auto PauseCycles = 100'000; + const auto tsc = FencedRDTSC() + PauseCycles; + const auto eax = static_cast(tsc & 0xFFFFFFFF); + const auto edx = static_cast(tsc >> 32); + asm volatile("tpause %0" : : "r"(0), "d"(edx), "a"(eax)); +} +#endif + +void MicroSleep() { + static const bool has_waitpkg = GetCPUCaps().waitpkg; + + if (has_waitpkg) { + TPAUSE(); + } else { + std::this_thread::yield(); + } +} + +} // namespace Common::X64 diff --git a/src/common/x64/cpu_wait.h b/src/common/x64/cpu_wait.h new file mode 100644 index 0000000000..99d3757a76 --- /dev/null +++ b/src/common/x64/cpu_wait.h @@ -0,0 +1,10 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +namespace Common::X64 { + +void MicroSleep(); + +} // namespace Common::X64 From e67edd4bb76f832406e1febe79af05eae6493bc8 Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Tue, 21 Mar 2023 21:30:02 -0400 Subject: [PATCH 3/5] core_timing: Make use of MicroSleep for x64 CPUs For CPUs that support tpause, this should result in significant CPU power savings over thread yield in this spin wait. --- src/core/core_timing.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp index cd4df45228..4f2692b05c 100644 --- a/src/core/core_timing.cpp +++ b/src/core/core_timing.cpp @@ -10,6 +10,10 @@ #include "common/windows/timer_resolution.h" #endif +#ifdef ARCHITECTURE_x86_64 +#include "common/x64/cpu_wait.h" +#endif + #include "common/microprofile.h" #include "core/core_timing.h" #include "core/core_timing_util.h" @@ -269,7 +273,11 @@ void CoreTiming::ThreadLoop() { if (wait_time >= timer_resolution_ns) { Common::Windows::SleepForOneTick(); } else { +#ifdef ARCHITECTURE_x86_64 + Common::X64::MicroSleep(); +#else std::this_thread::yield(); +#endif } } From fa3904acd9d01b9a9e2f7d94cf307faaf3938350 Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Wed, 22 Mar 2023 16:40:41 -0400 Subject: [PATCH 4/5] x64: Simplify RDTSC on non-MSVC compilers Co-Authored-By: liamwhite --- src/common/x64/cpu_wait.cpp | 13 +++++-------- src/common/x64/native_clock.cpp | 13 +++++-------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/src/common/x64/cpu_wait.cpp b/src/common/x64/cpu_wait.cpp index 1fab0bfe8a..cfeef6a3d3 100644 --- a/src/common/x64/cpu_wait.cpp +++ b/src/common/x64/cpu_wait.cpp @@ -33,16 +33,13 @@ __forceinline static void TPAUSE() { } #else static u64 FencedRDTSC() { - u64 result; + u64 eax; + u64 edx; asm volatile("lfence\n\t" "rdtsc\n\t" - "shl $32, %%rdx\n\t" - "or %%rdx, %0\n\t" - "lfence" - : "=a"(result) - : - : "rdx", "memory", "cc"); - return result; + "lfence\n\t" + : "=a"(eax), "=d"(edx)); + return (edx << 32) | eax; } static void TPAUSE() { diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp index 76c66e7eed..277b006625 100644 --- a/src/common/x64/native_clock.cpp +++ b/src/common/x64/native_clock.cpp @@ -27,16 +27,13 @@ __forceinline static u64 FencedRDTSC() { } #else static u64 FencedRDTSC() { - u64 result; + u64 eax; + u64 edx; asm volatile("lfence\n\t" "rdtsc\n\t" - "shl $32, %%rdx\n\t" - "or %%rdx, %0\n\t" - "lfence" - : "=a"(result) - : - : "rdx", "memory", "cc"); - return result; + "lfence\n\t" + : "=a"(eax), "=d"(edx)); + return (edx << 32) | eax; } #endif From 164d930c8d12626239195621461a96f45b7aa5c4 Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Thu, 23 Mar 2023 20:37:51 -0400 Subject: [PATCH 5/5] telemetry: Add waitpkg instruction --- src/common/telemetry.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/common/telemetry.cpp b/src/common/telemetry.cpp index d263943590..91352912d5 100644 --- a/src/common/telemetry.cpp +++ b/src/common/telemetry.cpp @@ -97,6 +97,7 @@ void AppendCPUInfo(FieldCollection& fc) { add_field("CPU_Extension_x64_PCLMULQDQ", caps.pclmulqdq); add_field("CPU_Extension_x64_POPCNT", caps.popcnt); add_field("CPU_Extension_x64_SHA", caps.sha); + add_field("CPU_Extension_x64_WAITPKG", caps.waitpkg); #else fc.AddField(FieldType::UserSystem, "CPU_Model", "Other"); #endif