diff --git a/tmk_core/chibios.mk b/tmk_core/chibios.mk
index 7962516a0a..18839710be 100644
--- a/tmk_core/chibios.mk
+++ b/tmk_core/chibios.mk
@@ -211,7 +211,8 @@ CHIBISRC = $(STARTUPSRC) \
        $(BOARDSRC) \
        $(STREAMSSRC) \
        $(CHIBIOS)/os/various/syscalls.c \
-       $(PLATFORM_COMMON_DIR)/syscall-fallbacks.c
+       $(PLATFORM_COMMON_DIR)/syscall-fallbacks.c \
+       $(PLATFORM_COMMON_DIR)/wait.c
 
 # Ensure the ASM files are not subjected to LTO -- it'll strip out interrupt handlers otherwise.
 QUANTUM_LIB_SRC += $(STARTUPASM) $(PORTASM) $(OSALASM) $(PLATFORMASM)
diff --git a/tmk_core/common/chibios/_wait.c b/tmk_core/common/chibios/_wait.c
new file mode 100644
index 0000000000..1fbea2dd5e
--- /dev/null
+++ b/tmk_core/common/chibios/_wait.c
@@ -0,0 +1,89 @@
+/* Copyright 2021 QMK
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __OPTIMIZE__
+#    pragma message "Compiler optimizations disabled; wait_cpuclock() won't work as designed"
+#endif
+
+#define CLOCK_DELAY_NOP8 "nop\n\t nop\n\t nop\n\t nop\n\t   nop\n\t nop\n\t nop\n\t nop\n\t"
+
+__attribute__((always_inline)) static inline void wait_cpuclock(unsigned int n) { /* n: 1..135; 0 emits nothing, n >= 136 emits only n % 8 NOPs */
+    /* Emits n "nop" instructions in place. The argument n must be a constant expression.
+     * That way, compiler optimization will remove unnecessary code (the switch cases that are not taken). */
+    if (n < 1) {
+        return;
+    }
+    if (n > 8) { /* n == 8 itself is handled by the single-NOP switch further down */
+        unsigned int n8 = n / 8; /* number of 8-NOP groups to emit */
+        n               = n - n8 * 8; /* remaining single NOPs, i.e. n % 8 */
+        switch (n8) { /* intentional fallthrough: entering at case k emits k groups of 8 NOPs; no case above 16 */
+            case 16:
+                asm volatile(CLOCK_DELAY_NOP8::: "memory");
+            case 15:
+                asm volatile(CLOCK_DELAY_NOP8::: "memory");
+            case 14:
+                asm volatile(CLOCK_DELAY_NOP8::: "memory");
+            case 13:
+                asm volatile(CLOCK_DELAY_NOP8::: "memory");
+            case 12:
+                asm volatile(CLOCK_DELAY_NOP8::: "memory");
+            case 11:
+                asm volatile(CLOCK_DELAY_NOP8::: "memory");
+            case 10:
+                asm volatile(CLOCK_DELAY_NOP8::: "memory");
+            case 9:
+                asm volatile(CLOCK_DELAY_NOP8::: "memory");
+            case 8:
+                asm volatile(CLOCK_DELAY_NOP8::: "memory");
+            case 7:
+                asm volatile(CLOCK_DELAY_NOP8::: "memory");
+            case 6:
+                asm volatile(CLOCK_DELAY_NOP8::: "memory");
+            case 5:
+                asm volatile(CLOCK_DELAY_NOP8::: "memory");
+            case 4:
+                asm volatile(CLOCK_DELAY_NOP8::: "memory");
+            case 3:
+                asm volatile(CLOCK_DELAY_NOP8::: "memory");
+            case 2:
+                asm volatile(CLOCK_DELAY_NOP8::: "memory");
+            case 1:
+                asm volatile(CLOCK_DELAY_NOP8::: "memory");
+            case 0:
+                break;
+        }
+    }
+    switch (n) { /* intentional fallthrough: entering at case k emits k single NOPs */
+        case 8:
+            asm volatile("nop" ::: "memory");
+        case 7:
+            asm volatile("nop" ::: "memory");
+        case 6:
+            asm volatile("nop" ::: "memory");
+        case 5:
+            asm volatile("nop" ::: "memory");
+        case 4:
+            asm volatile("nop" ::: "memory");
+        case 3:
+            asm volatile("nop" ::: "memory");
+        case 2:
+            asm volatile("nop" ::: "memory");
+        case 1:
+            asm volatile("nop" ::: "memory");
+        case 0:
+            break;
+    }
+}
diff --git a/tmk_core/common/chibios/_wait.h b/tmk_core/common/chibios/_wait.h
index 5bface53e1..4a5172536b 100644
--- a/tmk_core/common/chibios/_wait.h
+++ b/tmk_core/common/chibios/_wait.h
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <ch.h>
+#include <hal.h>
 
 /* chThdSleepX of zero maps to infinite - so we map to a tiny delay to still yield */
 #define wait_ms(ms)                     \
@@ -26,14 +27,19 @@
             chThdSleepMicroseconds(1);  \
         }                               \
     } while (0)
-#define wait_us(us)                     \
-    do {                                \
-        if (us != 0) {                  \
-            chThdSleepMicroseconds(us); \
-        } else {                        \
-            chThdSleepMicroseconds(1);  \
-        }                               \
-    } while (0)
+
+#ifdef WAIT_US_TIMER /* WAIT_US_TIMER defined: wait_us() is a function taking the delay in microseconds */
+void wait_us(uint16_t duration);
+#else /* macro form: a zero delay is replaced by 1 us; `us` is evaluated twice, so pass a side-effect-free expression */
+#    define wait_us(us)                     \
+        do {                                \
+            if ((us) != 0) {                \
+                chThdSleepMicroseconds(us); \
+            } else {                        \
+                chThdSleepMicroseconds(1);  \
+            }                               \
+        } while (0)
+#endif
 
 /* For GPIOs on ARM-based MCUs, the input pins are sampled by the clock of the bus
  * to which the GPIO is connected.
@@ -46,7 +52,7 @@
  * (A fairly large value of 0.25 microseconds is set.)
  */
 
-#include "wait.c"
+#include "_wait.c"
 
 #ifndef GPIO_INPUT_PIN_DELAY
 #    define GPIO_INPUT_PIN_DELAY (STM32_SYSCLK / 1000000L / 4)
diff --git a/tmk_core/common/chibios/wait.c b/tmk_core/common/chibios/wait.c
index c6270fd95e..56fd6ffcec 100644
--- a/tmk_core/common/chibios/wait.c
+++ b/tmk_core/common/chibios/wait.c
@@ -14,76 +14,28 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef __OPTIMIZE__
-#    pragma message "Compiler optimizations disabled; wait_cpuclock() won't work as designed"
+#include <ch.h>
+#include <hal.h>
+
+#include "_wait.h"
+
+#ifdef WAIT_US_TIMER
+void wait_us(uint16_t duration) {
+    /* 1 MHz timer with no callback: one timer tick per requested microsecond. */
+    static const GPTConfig timer_cfg = {1000000, NULL, 0, 0};
+    /* A request of zero is raised to one microsecond. */
+    const uint16_t delay_us = (duration != 0) ? duration : 1;
+
+    /*
+     * Only the main thread uses this timer (other threads need to use their
+     * own), and only when the tick count is below 2^(bits in gptcnt_t).
+     * Every other case goes through chThdSleepMicroseconds() instead.
+     */
+    if (chThdGetSelfX() != &ch.mainthread || delay_us >= (1ULL << (sizeof(gptcnt_t) * 8))) {
+        chThdSleepMicroseconds(delay_us);
+        return;
+    }
+    gptStart(&WAIT_US_TIMER, &timer_cfg);
+    gptPolledDelay(&WAIT_US_TIMER, delay_us);
+}
 #endif
-
-#define CLOCK_DELAY_NOP8 "nop\n\t nop\n\t nop\n\t nop\n\t   nop\n\t nop\n\t nop\n\t nop\n\t"
-
-__attribute__((always_inline)) static inline void wait_cpuclock(unsigned int n) { /* n: 1..135 */
-    /* The argument n must be a constant expression.
-     * That way, compiler optimization will remove unnecessary code. */
-    if (n < 1) {
-        return;
-    }
-    if (n > 8) {
-        unsigned int n8 = n / 8;
-        n               = n - n8 * 8;
-        switch (n8) {
-            case 16:
-                asm volatile(CLOCK_DELAY_NOP8::: "memory");
-            case 15:
-                asm volatile(CLOCK_DELAY_NOP8::: "memory");
-            case 14:
-                asm volatile(CLOCK_DELAY_NOP8::: "memory");
-            case 13:
-                asm volatile(CLOCK_DELAY_NOP8::: "memory");
-            case 12:
-                asm volatile(CLOCK_DELAY_NOP8::: "memory");
-            case 11:
-                asm volatile(CLOCK_DELAY_NOP8::: "memory");
-            case 10:
-                asm volatile(CLOCK_DELAY_NOP8::: "memory");
-            case 9:
-                asm volatile(CLOCK_DELAY_NOP8::: "memory");
-            case 8:
-                asm volatile(CLOCK_DELAY_NOP8::: "memory");
-            case 7:
-                asm volatile(CLOCK_DELAY_NOP8::: "memory");
-            case 6:
-                asm volatile(CLOCK_DELAY_NOP8::: "memory");
-            case 5:
-                asm volatile(CLOCK_DELAY_NOP8::: "memory");
-            case 4:
-                asm volatile(CLOCK_DELAY_NOP8::: "memory");
-            case 3:
-                asm volatile(CLOCK_DELAY_NOP8::: "memory");
-            case 2:
-                asm volatile(CLOCK_DELAY_NOP8::: "memory");
-            case 1:
-                asm volatile(CLOCK_DELAY_NOP8::: "memory");
-            case 0:
-                break;
-        }
-    }
-    switch (n) {
-        case 8:
-            asm volatile("nop" ::: "memory");
-        case 7:
-            asm volatile("nop" ::: "memory");
-        case 6:
-            asm volatile("nop" ::: "memory");
-        case 5:
-            asm volatile("nop" ::: "memory");
-        case 4:
-            asm volatile("nop" ::: "memory");
-        case 3:
-            asm volatile("nop" ::: "memory");
-        case 2:
-            asm volatile("nop" ::: "memory");
-        case 1:
-            asm volatile("nop" ::: "memory");
-        case 0:
-            break;
-    }
-}
\ No newline at end of file