mirror of
https://github.com/jorio/Pomme.git
synced 2024-06-10 06:29:31 +00:00
Rewrite __frsqrte with Arm/NEON counterpart
__frsqrte is the intrinsic for the floating-point reciprocal square root estimate. On Arm64, we can implement it with NEON intrinsics. Since Armv8.2, the instruction "FRSQRTE" [1] is provided to calculate an approximate reciprocal square root for each vector element in the source SIMD and FP register. With -O3, the generated assembly on Apple Silicon M1 is: [original] fsqrt s0, s0; fmov s1, #1.00000000; fdiv s0, s1, s0 [neon] dup.4s v0, v0[0]; frsqrte.4s v0, v0 [1] https://developer.arm.com/documentation/100076/0100/a64-instruction-set-reference/a64-simd-scalar-instructions/frsqrte--scalar-
This commit is contained in:
parent
1b0ea49ded
commit
86d35caf94
|
@ -16,7 +16,15 @@
|
||||||
|
|
||||||
/*
 * Compatibility macros for the __fres / __fabs intrinsic names.
 *
 * __fres(x): reciprocal of x, computed as a single-precision divide.
 *            The argument is parenthesized so that a compound expression
 *            such as __fres(a + b) expands to 1.0f/(a + b) rather than
 *            (1.0f/a) + b.
 * __fabs(x): absolute value of x, forwarded to fabs() from <math.h>.
 */
#define __fres(x) (1.0f/(x))
#define __fabs(x) fabs(x)
|
||||||
|
/*
 * __frsqrte(x): reciprocal square root of x.
 *
 * On AArch64 this maps to the scalar FRSQRTE instruction through the ACLE
 * intrinsic vrsqrtes_f32(). The scalar form is used on purpose: it avoids
 * broadcasting the value into a 4-lane vector and does not rely on the
 * GCC/Clang vector-subscript extension to read lane 0 back out.
 * The result is a hardware *estimate*, so it will generally not be
 * bit-identical to the value produced by the fallback below.
 *
 * On every other target it falls back to 1.0f/sqrtf(x) from <math.h>.
 */
#if defined(__aarch64__)
#include <arm_neon.h>
static inline float __frsqrte(float f)
{
    return vrsqrtes_f32(f);
}
#else
#define __frsqrte(x) (1.0f/sqrtf(x))
#endif
|
||||||
|
|
||||||
//-----------------------------------------------------------------------------
|
//-----------------------------------------------------------------------------
|
||||||
// Source code compat
|
// Source code compat
|
||||||
|
|
Loading…
Reference in New Issue
Block a user