From 86d35caf94c2dad171dc27f82005131b5afed9ef Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sat, 29 Jan 2022 22:21:01 +0800 Subject: [PATCH] Rewrite __frsqrte with Arm/NEON counterpart __frsqrte is the intrinsic for floating-point reciprocal square root estimate. On Arm64, we can implement it with NEON intrinsics. The "FRSQRTE" instruction [1], part of Armv8-A Advanced SIMD (a half-precision form was added in Armv8.2), calculates an approximate reciprocal square root for each vector element in the source SIMD and FP register. With -O3, generated assembly on Apple Silicon M1: [original] fsqrt s0, s0 fmov s1, #1.00000000 fdiv s0, s1, s0 [neon] dup.4s v0, v0[0] frsqrte.4s v0, v0 [1] https://developer.arm.com/documentation/100076/0100/a64-instruction-set-reference/a64-simd-scalar-instructions/frsqrte--scalar- --- src/Pomme.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/Pomme.h b/src/Pomme.h index c3eb024..0ed6823 100644 --- a/src/Pomme.h +++ b/src/Pomme.h @@ -16,7 +16,15 @@ #define __fres(x) (1.0f/x) #define __fabs(x) fabs(x) +#if defined(__aarch64__) +#include <arm_neon.h> +static inline float __frsqrte(float f) +{ + return vrsqrteq_f32(vdupq_n_f32(f))[0]; +} +#else #define __frsqrte(x) (1.0f/sqrtf(x)) +#endif //----------------------------------------------------------------------------- // Source code compat