author     Andreas Jaeger <aj@suse.de>    2002-08-31 17:45:33 +0000
committer  Andreas Jaeger <aj@suse.de>    2002-08-31 17:45:33 +0000
commit     78df0fcb80247ca7573a8ed07cc992b7031674c1 (patch)
tree       339b581727c7ae5782f5118bf98567acda987553 /sysdeps/x86_64/memset.S
parent     7c9466bc7688e084cfbf9311eb91bdbaed1ea888 (diff)
Update.
* sysdeps/x86_64/dl-machine.h (elf_machine_runtime_setup): Declare
external functions with hidden attribute (see the sketch after this list).
(elf_machine_rela): Optimize.
* sysdeps/x86_64/memset.S: New file.
* sysdeps/x86_64/bzero.S: New file.
* sysdeps/x86_64/stpcpy.S: New file.
* sysdeps/x86_64/strcat.S: New file.
* sysdeps/x86_64/strchr.S: New file.
* sysdeps/x86_64/strcpy.S: New file.
* sysdeps/x86_64/strcspn.S: New file.
* sysdeps/x86_64/strlen.S: New file.
* sysdeps/x86_64/strpbrk.S: New file.
* sysdeps/x86_64/strspn.S: New file.
* sysdeps/x86_64/strcmp.S: New file.
* sysdeps/x86_64/strtok_r.S: New file.
* sysdeps/x86_64/strtok.S: New file.
* sysdeps/x86_64/memcpy.S: New file.
* sysdeps/x86_64/mempcpy.S: New file.
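A short example makes the dl-machine.h entry above concrete: inside the dynamic linker, a call to an ordinary external function would go through the PLT, which cannot work before ld.so has finished relocating itself, so such functions are declared with hidden visibility and bind locally. The following is a hypothetical C sketch, not the glibc source; the function name is invented.

    /* Hypothetical sketch: declaring an external function with the
       hidden attribute, as the ChangeLog entry describes.  A hidden
       symbol is bound within the shared object itself, so calls to it
       need no PLT indirection.  */
    extern void _dl_example_fixup (unsigned int arg)
         __attribute__ ((visibility ("hidden")));

    void
    caller (void)
    {
      _dl_example_fixup (0);   /* binds locally: no PLT entry needed */
    }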
Diffstat (limited to 'sysdeps/x86_64/memset.S')
-rw-r--r--  sysdeps/x86_64/memset.S  131
1 file changed, 131 insertions(+), 0 deletions(-)
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
new file mode 100644
index 0000000000..b95ca40b2f
--- /dev/null
+++ b/sysdeps/x86_64/memset.S
@@ -0,0 +1,131 @@
+/* memset/bzero -- set memory area to CH/0
+   Optimized version for x86-64.
+   Copyright (C) 2002 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Andreas Jaeger <aj@suse.de>.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+#include "bp-sym.h"
+#include "bp-asm.h"
+
+/* BEWARE: `#ifdef memset' means that memset is redefined as `bzero' */
+#define BZERO_P (defined memset)
+
+/* This is somewhat experimental and could be made dependent on the
+   cache size.  */
+#define LARGE $120000
+
+	.text
+ENTRY (memset)
+#if BZERO_P
+	mov	%rsi,%rdx	/* Adjust parameter.  */
+	xorq	%rsi,%rsi	/* Fill with 0s.  */
+#endif
+	cmp	$0x7,%rdx	/* Check for small length.  */
+	mov	%rdi,%rcx	/* Save ptr as return value.  */
+	jbe	7f
+
+#if BZERO_P
+	mov	%rsi,%r8	/* Just copy 0.  */
+#else
+	/* Populate 8 bit data to full 64-bit.  */
+	movabs	$0x0101010101010101,%r8
+	movzbl	%sil,%eax
+	imul	%rax,%r8
+#endif
+	test	$0x7,%edi	/* Check for alignment.  */
+	je	2f
+
+	.p2align 4
+1:	/* Align ptr to 8 bytes.  */
+	mov	%sil,(%rcx)
+	dec	%rdx
+	inc	%rcx
+	test	$0x7,%ecx
+	jne	1b
+
+2:	/* Check for really large regions.  */
+	mov	%rdx,%rax
+	shr	$0x6,%rax
+	je	4f
+	cmp	LARGE, %rdx
+	jae	11f
+
+	.p2align 4
+3:	/* Copy 64 bytes.  */
+	mov	%r8,(%rcx)
+	mov	%r8,0x8(%rcx)
+	mov	%r8,0x10(%rcx)
+	mov	%r8,0x18(%rcx)
+	mov	%r8,0x20(%rcx)
+	mov	%r8,0x28(%rcx)
+	mov	%r8,0x30(%rcx)
+	mov	%r8,0x38(%rcx)
+	add	$0x40,%rcx
+	dec	%rax
+	jne	3b
+
+4:	/* Copy final bytes.  */
+	and	$0x3f,%edx
+	mov	%rdx,%rax
+	shr	$0x3,%rax
+	je	6f
+
+5:	/* First in chunks of 8 bytes.  */
+	mov	%r8,(%rcx)
+	add	$0x8,%rcx
+	dec	%rax
+	jne	5b
+6:
+	and	$0x7,%edx
+7:
+	test	%rdx,%rdx
+	je	9f
+8:	/* And finally as bytes (up to 7).  */
+	mov	%sil,(%rcx)
+	inc	%rcx
+	dec	%rdx
+	jne	8b
+9:
+#if BZERO_P
+	nop
+#else
+	/* Load result (only if used as memset).  */
+	mov	%rdi,%rax	/* start address of destination is result */
+#endif
+	retq
+
+	.p2align 4
+11:	/* Copy 64 bytes without polluting the cache.  */
+	/* We could use movntdq %xmm0,(%rcx) here to further
+	   speed up for large cases but let's not use XMM registers.  */
+	movnti	%r8,(%rcx)
+	movnti	%r8,0x8(%rcx)
+	movnti	%r8,0x10(%rcx)
+	movnti	%r8,0x18(%rcx)
+	movnti	%r8,0x20(%rcx)
+	movnti	%r8,0x28(%rcx)
+	movnti	%r8,0x30(%rcx)
+	movnti	%r8,0x38(%rcx)
+	add	$0x40,%rcx
+	dec	%rax
+	jne	11b
+	jmp	4b
+
+END (memset)
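For readers who don't read AT&T assembly, here is a minimal C sketch of the routine's structure. It is not part of the commit: memset_sketch is an invented name, the 120000-byte threshold mirrors the LARGE constant above, and the movnti stores are approximated with the SSE2 intrinsic _mm_stream_si64, which is an assumed analogue rather than what the assembly emits. The byte-replication trick is the movabs/imul pair: multiplying the fill byte by 0x0101010101010101 copies it into all eight byte lanes, e.g. 0x5A * 0x0101010101010101 = 0x5A5A5A5A5A5A5A5A.

    #include <stddef.h>
    #include <stdint.h>
    #include <emmintrin.h>   /* _mm_stream_si64 (SSE2, x86-64 only) */

    #define LARGE 120000     /* mirrors the LARGE constant above */

    void *
    memset_sketch (void *dst, int ch, size_t len)
    {
      unsigned char *p = dst;

      /* Replicate the fill byte into all eight bytes of a word, as the
         assembly does with movabs $0x0101010101010101 + imul.  */
      uint64_t word = (unsigned char) ch * 0x0101010101010101ULL;

      if (len > 7)
        {
          /* Byte stores until the pointer is 8-byte aligned (label 1).  */
          while ((uintptr_t) p & 0x7)
            {
              *p++ = (unsigned char) ch;
              --len;
            }

          size_t chunks = len >> 6;     /* number of 64-byte blocks */

          if (len >= LARGE)
            /* Really large region: write around the cache, approximating
               the movnti loop at label 11.  NB: the pointer casts mirror
               the assembly's word stores; strictly conforming C would
               use memcpy instead.  */
            for (; chunks > 0; --chunks, p += 64)
              for (int i = 0; i < 8; ++i)
                _mm_stream_si64 ((long long *) (p + 8 * i), (long long) word);
          else
            /* Normal path: eight 8-byte stores per iteration (label 3).  */
            for (; chunks > 0; --chunks, p += 64)
              for (int i = 0; i < 8; ++i)
                ((uint64_t *) p)[i] = word;

          /* Remaining full words (label 5).  */
          for (len &= 0x3f; len >= 8; len -= 8, p += 8)
            *(uint64_t *) p = word;
        }

      /* Final 0..7 bytes (labels 7/8).  */
      while (len > 0)
        {
          *p++ = (unsigned char) ch;
          --len;
        }

      return dst;   /* memset returns the destination pointer */
    }

The two-loop split reflects the comment at label 11: for fills much larger than the cache, ordinary stores would evict useful data on their way through, so the large path uses non-temporal stores that go straight to memory, at the cost of cache residency for the freshly written region.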