diff options
author | Ondrej Bilka <neleai@seznam.cz> | 2013-03-06 22:27:18 +0100 |
---|---|---|
committer | Ondrej Bilka <neleai@seznam.cz> | 2013-03-06 22:27:18 +0100 |
commit | 87bd9bc4bd2a49a441bb9ba744c9ddb0c9434823 (patch) | |
tree | ddef45a58945fed230d26a155bbc10739b3fa864 /sysdeps/x86_64/strlen.S | |
parent | b79188d71716b6286866e06add976fe84100595e (diff) | |
download | glibc-87bd9bc4bd2a49a441bb9ba744c9ddb0c9434823.tar.gz |
Revert " * sysdeps/x86_64/strlen.S: Replace with new SSE2 based implementation"
This reverts commit b79188d71716b6286866e06add976fe84100595e.
Diffstat (limited to 'sysdeps/x86_64/strlen.S')
-rw-r--r-- | sysdeps/x86_64/strlen.S | 263 |
1 files changed, 65 insertions, 198 deletions
```diff
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index e82fe8d039..4bdca0a452 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,5 +1,6 @@
-/* SSE2 version of strlen.
-   Copyright (C) 2012, 2013 Free Software Foundation, Inc.
+/* strlen(str) -- determine the length of the string STR.
+   Copyright (C) 2009-2013 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@redhat.com>.
    This file is part of the GNU C Library.
    The GNU C Library is free software; you can redistribute it and/or
@@ -18,217 +19,83 @@
 #include <sysdep.h>
-/* Long lived register are
-	strlen(s), strnlen(s, n):
-	%xmm11 - zero
-	%rdi - s
-	%r10 (s+n) & (~(64-1))
-	%r11 s+n
-*/
-
-
-.text
+	.text
 ENTRY(strlen)
-
-#define FIND_ZERO \
-	pcmpeqb (%rax), %xmm8; \
-	pcmpeqb 16(%rax), %xmm9; \
-	pcmpeqb 32(%rax), %xmm10; \
-	pcmpeqb 48(%rax), %xmm11; \
-	pmovmskb %xmm8, %esi; \
-	pmovmskb %xmm9, %edx; \
-	pmovmskb %xmm10, %r8d; \
-	pmovmskb %xmm11, %ecx; \
-	salq $16, %rdx; \
-	salq $16, %rcx; \
-	orq %rsi, %rdx; \
-	orq %r8, %rcx; \
-	salq $32, %rcx; \
-	orq %rcx, %rdx;
-
-#ifdef AS_STRNLEN
-/* Do not read anything when n==0. */
-	test %rsi, %rsi
-	jne L(n_nonzero)
 	xor %rax, %rax
-	ret
-L(n_nonzero):
-
-/* Initialize long lived registers. */
-
-	add %rdi, %rsi
-	mov %rsi, %r10
-	and $-64, %r10
-	mov %rsi, %r11
-#endif
-
-	pxor %xmm8, %xmm8
-	pxor %xmm9, %xmm9
-	pxor %xmm10, %xmm10
-	pxor %xmm11, %xmm11
-	movq %rdi, %rax
-	movq %rdi, %rcx
-	andq $4095, %rcx
-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
-	cmpq $4047, %rcx
-/* We cannot unify this branching as it would be ~6 cycles slower. */
+	mov %edi, %ecx
+	and $0x3f, %ecx
+	pxor %xmm0, %xmm0
+	cmp $0x30, %ecx
 	ja L(next)
-
-#ifdef AS_STRNLEN
-# define STRNLEN_PROLOG \
-	mov %r11, %rsi; \
-	subq %rax, %rsi; \
-	andq $-64, %rax; \
-	testq $-64, %rsi; \
-	je L(strnlen_ret)
-#else
-# define STRNLEN_PROLOG andq $-64, %rax;
-#endif
-
-#define PROLOG(lab) \
-	movq %rdi, %rcx; \
-	xorq %rax, %rcx; \
-	STRNLEN_PROLOG; \
-	sarq %cl, %rdx; \
-	test %rdx, %rdx; \
-	je L(lab); \
-	bsfq %rdx, %rax; \
-	ret
-
-#ifdef AS_STRNLEN
-	andq $-16, %rax
-	FIND_ZERO
-#else
-	movdqu (%rax), %xmm12
-	pcmpeqb %xmm8, %xmm12
-	pmovmskb %xmm12, %edx
+	movdqu (%rdi), %xmm1
+	pcmpeqb %xmm1, %xmm0
+	pmovmskb %xmm0, %edx
 	test %edx, %edx
-	je L(next48_bytes)
-	bsfq %rdx, %rax
-	ret
-
-L(next48_bytes):
-/* Same as FIND_ZERO except we do not check first 16 bytes. */
-	andq $-16, %rax
-	pcmpeqb 16(%rax), %xmm9;
-	pcmpeqb 32(%rax), %xmm10;
-	pcmpeqb 48(%rax), %xmm11;
-	pmovmskb %xmm9, %edx;
-	pmovmskb %xmm10, %r8d;
-	pmovmskb %xmm11, %ecx;
-	salq $16, %rdx;
-	salq $16, %rcx;
-	orq %r8, %rcx;
-	salq $32, %rcx;
-	orq %rcx, %rdx;
-#endif
-
-	PROLOG(loop)
-
-	.p2align 4
+	jnz L(exit_less16)
+	mov %rdi, %rax
+	and $-16, %rax
+	jmp L(align16_start)
 L(next):
-	andq $-64, %rax
-	FIND_ZERO
-	PROLOG(loop_init)
-
-#ifdef AS_STRNLEN
-/* We must do this check to correctly handle strnlen (s, -1). */
-L(strnlen_ret):
-	bts %rsi, %rdx
-	sarq %cl, %rdx
-	test %rdx, %rdx
-	je L(loop_init)
-	bsfq %rdx, %rax
-	ret
-#endif
-	.p2align 4
-L(loop_init):
-	pxor %xmm9, %xmm9
-	pxor %xmm10, %xmm10
-	pxor %xmm11, %xmm11
-#ifdef AS_STRNLEN
+	mov %rdi, %rax
+	and $-16, %rax
+	pcmpeqb (%rax), %xmm0
+	mov $-1, %esi
+	sub %rax, %rcx
+	shl %cl, %esi
+	pmovmskb %xmm0, %edx
+	and %esi, %edx
+	jnz L(exit)
+L(align16_start):
+	pxor %xmm0, %xmm0
+	pxor %xmm1, %xmm1
+	pxor %xmm2, %xmm2
+	pxor %xmm3, %xmm3
 	.p2align 4
-L(loop):
-
-	addq $64, %rax
-	cmpq %rax, %r10
-	je L(exit_end)
+L(align16_loop):
+	pcmpeqb 16(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	test %edx, %edx
+	jnz L(exit16)
-	movdqa (%rax), %xmm8
-	pminub 16(%rax), %xmm8
-	pminub 32(%rax), %xmm8
-	pminub 48(%rax), %xmm8
-	pcmpeqb %xmm11, %xmm8
-	pmovmskb %xmm8, %edx
-	testl %edx, %edx
-	jne L(exit)
-	jmp L(loop)
+	pcmpeqb 32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test %edx, %edx
+	jnz L(exit32)
-	.p2align 4
-L(exit_end):
-	cmp %rax, %r11
-	je L(first)
-	pxor %xmm8, %xmm8
-	FIND_ZERO
+	pcmpeqb 48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test %edx, %edx
+	jnz L(exit48)
-L(first):
-	bts %r11, %rdx
-	bsfq %rdx, %rdx
-	addq %rdx, %rax
-	subq %rdi, %rax
+	pcmpeqb 64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea 64(%rax), %rax
+	test %edx, %edx
+	jz L(align16_loop)
+L(exit):
+	sub %rdi, %rax
+L(exit_less16):
+	bsf %rdx, %rdx
+	add %rdx, %rax
 	ret
-	.p2align 4
-L(exit):
-	pxor %xmm8, %xmm8
-	FIND_ZERO
-
-	bsfq %rdx, %rdx
-	addq %rdx, %rax
-	subq %rdi, %rax
+L(exit16):
+	sub %rdi, %rax
+	bsf %rdx, %rdx
+	lea 16(%rdx,%rax), %rax
 	ret
-
-#else
 	.p2align 4
-L(loop):
-
-	movdqa 64(%rax), %xmm8
-	pminub 80(%rax), %xmm8
-	pminub 96(%rax), %xmm8
-	pminub 112(%rax), %xmm8
-	pcmpeqb %xmm11, %xmm8
-	pmovmskb %xmm8, %edx
-	testl %edx, %edx
-	jne L(exit64)
-
-	subq $-128, %rax
-
-	movdqa (%rax), %xmm8
-	pminub 16(%rax), %xmm8
-	pminub 32(%rax), %xmm8
-	pminub 48(%rax), %xmm8
-	pcmpeqb %xmm11, %xmm8
-	pmovmskb %xmm8, %edx
-	testl %edx, %edx
-	jne L(exit0)
-	jmp L(loop)
-
+L(exit32):
+	sub %rdi, %rax
+	bsf %rdx, %rdx
+	lea 32(%rdx,%rax), %rax
+	ret
 	.p2align 4
-L(exit64):
-	addq $64, %rax
-L(exit0):
-	pxor %xmm8, %xmm8
-	FIND_ZERO
-
-	bsfq %rdx, %rdx
-	addq %rdx, %rax
-	subq %rdi, %rax
+L(exit48):
+	sub %rdi, %rax
+	bsf %rdx, %rdx
+	lea 48(%rdx,%rax), %rax
 	ret
-
-#endif
-
 END(strlen)
-#ifndef AS_STRLEN
 libc_hidden_builtin_def (strlen)
-#endif
```