tls: fix ROL/ROR x86 optimization

With ALWAYS_INLINE alone:

function                                             old     new   delta
psAesInitKey                                         825     824      -1
ROR                                                    5       -      -5
setup_mix2                                           148     134     -14
psAesDecryptBlock                                   1184    1139     -45
psAesEncryptBlock                                   1193    1102     -91
------------------------------------------------------------------------------
(add/remove: 0/1 grow/shrink: 0/4 up/down: 0/-156)           Total: -156 bytes

With ALWAYS_INLINE plus a __builtin_constant_p() check on the shift count:

function                                             old     new   delta
ROR                                                    5       -      -5
psAesInitKey                                         825     818      -7
setup_mix2                                           148     123     -25
psAesDecryptBlock                                   1184    1078    -106
psAesEncryptBlock                                   1193    1017    -176
------------------------------------------------------------------------------
(add/remove: 0/1 grow/shrink: 0/4 up/down: 0/-319)           Total: -319 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
Denys Vlasenko 2017-01-19 16:45:41 +01:00
parent 432f1ae2ff
commit f7806f9d8f

View File

@ -7,9 +7,6 @@
/* The part below is a section of matrixssl-3-7-2b-open/crypto/cryptolib.h
* Changes are flagged with //bbox
* TODO:
* Take a look at "roll %%cl" part... rotates by constant use fewer registers,
* and on many Intel CPUs rotates by %cl are slower: they take 2 cycles, not 1.
*/
/******************************************************************************/
@ -28,16 +25,28 @@
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && \
!defined(INTEL_CC) && !defined(PS_NO_ASM)
static inline unsigned ROL(unsigned word, int i)
/*
 * Rotate `word` left by `i` bit positions using the x86 `roll` instruction
 * and return the rotated value.
 *
 * Two encodings are used. `i` is a function parameter, so the compiler can
 * normally prove it constant only after the call has been inlined into a
 * caller that passes a literal count - hence ALWAYS_INLINE.
 */
static ALWAYS_INLINE unsigned ROL(unsigned word, int i)
{
if (__builtin_constant_p(i)) //box
// Rotates by constant use fewer registers,
// and on many Intel CPUs rotates by %cl take 2 cycles, not 1.
// The "i" constraint emits the count as an immediate operand (%2);
// "0" ties the input to the same register as output operand %0.
asm ("roll %2,%0" //box
:"=r" (word)
:"0" (word),"i" (i));
else //box
// Count is not a compile-time constant: the "c" constraint places it
// in the C register so the instruction can rotate by %cl.
asm ("roll %%cl,%0"
:"=r" (word)
:"0" (word),"c" (i));
return word;
}
static inline unsigned ROR(unsigned word, int i)
static ALWAYS_INLINE unsigned ROR(unsigned word, int i)
{
if (__builtin_constant_p(i)) //box
asm ("rorl %2,%0" //box
:"=r" (word)
:"0" (word),"i" (i));
else //box
asm ("rorl %%cl,%0"
:"=r" (word)
:"0" (word),"c" (i));