1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
|
/* memcpy with unaligned loads
Copyright (C) 2013-2016 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#if IS_IN (libc)
#include <sysdep.h>
#include "asm-syntax.h"
/* void *__memcpy_sse2_unaligned (void *dst, const void *src, size_t n)

   ABI:     System V AMD64.
   In:      rdi = dst, rsi = src, rdx = n.
   Out:     rax = dst.
   Scratch: rcx, r8, r9, r10, r11, xmm8, flags.  The stack is not used.

   The source and destination are presumably non-overlapping, as for
   any memcpy; nothing here detects or handles overlap.

   Strategy:
     n == 0         return immediately.
     1 <= n <= 16   one or two scalar moves anchored at the start and
                    at the end of the buffer; the two may overlap.
     17 <= n <= 128 pairs of 16-byte unaligned moves anchored at the
                    start and at the end; the pairs may overlap in the
                    middle.
     n > 128        the first and last 64 bytes are copied as above,
                    then a loop copies 64 bytes per iteration with
                    unaligned loads and 64-byte-aligned stores.

   The size compares are issued early and consumed by a branch several
   instructions later.  This relies on the SSE moves in between leaving
   the flags untouched, so keep the instruction order intact.  */
ENTRY(__memcpy_sse2_unaligned)
	movq	%rdi, %rax		/* Return value: dst.  */
	testq	%rdx, %rdx
	je	L(return)		/* n == 0: nothing to copy.  */
	cmpq	$16, %rdx
	jbe	L(less_16)

	/* n > 16: copy the first and the last 16 bytes.  */
	movdqu	(%rsi), %xmm8
	cmpq	$32, %rdx		/* Consumed by the ja below.  */
	movdqu	%xmm8, (%rdi)
	movdqu	-16(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -16(%rdi,%rdx)
	ja	L(above_32)
L(return):
	ret

	.p2align 4
L(above_32):
	/* n > 32: extend the copied head and tail to 32 bytes each.  */
	movdqu	16(%rsi), %xmm8
	cmpq	$64, %rdx		/* Consumed by the jbe below.  */
	movdqu	%xmm8, 16(%rdi)
	movdqu	-32(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -32(%rdi,%rdx)
	jbe	L(return)		/* n <= 64: head + tail cover it.  */

	/* n > 64: extend the copied head and tail to 64 bytes each.  */
	movdqu	32(%rsi), %xmm8
	cmpq	$128, %rdx		/* Consumed by the jbe below.  */
	movdqu	%xmm8, 32(%rdi)
	movdqu	-48(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -48(%rdi,%rdx)
	movdqu	48(%rsi), %xmm8
	movdqu	%xmm8, 48(%rdi)
	movdqu	-64(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -64(%rdi,%rdx)
	jbe	L(return)		/* n <= 128: head + tail cover it.  */

	/* n > 128.  The first and last 64 bytes are already in place, so
	   only the 64-byte-aligned destination range [rcx, rdx) is left:
	     rcx = (dst + 64) & -64, which lies in (dst, dst + 64]
	     rdx = (dst + n) & -64, which lies in (dst + n - 64, dst + n]  */
	leaq	64(%rdi), %rcx
	addq	%rdi, %rdx		/* rdx = dst + n.  */
	andq	$-64, %rdx		/* rdx = aligned end of the loop range.  */
	andq	$-64, %rcx		/* rcx = aligned start of the loop range.  */
	movq	%rcx, %r11
	subq	%rdi, %r11		/* r11 = bytes skipped at the head.  */
	addq	%r11, %rsi		/* Advance src by the same amount.  */
	cmpq	%rdx, %rcx
	je	L(return)		/* Empty range: skip the loop, which
					   tests its condition at the bottom.  */

	/* Address the source relative to the destination cursor so that
	   rcx is the only register updated inside the loop:
	     r10 = src - dst, r9 = r10 + 16, r8 = r10 + 32, r11 = r10 + 48.  */
	movq	%rsi, %r10
	subq	%rcx, %r10
	leaq	16(%r10), %r9
	leaq	32(%r10), %r8
	leaq	48(%r10), %r11

	.p2align 4
L(loop):
	/* Copy one 64-byte block.  Loads may be unaligned; the stores use
	   movdqa because rcx is a multiple of 64.  */
	movdqu	(%rcx,%r10), %xmm8
	movdqa	%xmm8, (%rcx)
	movdqu	(%rcx,%r9), %xmm8
	movdqa	%xmm8, 16(%rcx)
	movdqu	(%rcx,%r8), %xmm8
	movdqa	%xmm8, 32(%rcx)
	movdqu	(%rcx,%r11), %xmm8
	movdqa	%xmm8, 48(%rcx)
	addq	$64, %rcx
	cmpq	%rcx, %rdx
	jne	L(loop)
	ret

L(less_16):
	/* 1 <= n <= 16 here; n == 0 already returned at function entry.  */
	testb	$24, %dl		/* Bit 3 or bit 4 set: 8 <= n <= 16.  */
	jne	L(between_9_16)
	testb	$4, %dl			/* Bit 2 set: 4 <= n <= 7.  */
	.p2align 4,,5			/* Align the branch only if that takes
					   at most 5 bytes of padding.  */
	jne	L(between_5_8)

	/* 1 <= n <= 3: copy the first byte, then for n >= 2 the last two
	   bytes (these overlap the first byte when n == 2).  */
	movzbl	(%rsi), %ecx
	testb	$2, %dl			/* Consumed by the je below.  */
	movb	%cl, (%rdi)
	je	L(return)		/* n == 1: done.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movw	%cx, -2(%rdi,%rdx)
	ret

L(between_9_16):
	/* Reached for 8 <= n <= 16 despite the label name: copy the first
	   and the last 8 bytes, which may overlap.  */
	movq	(%rsi), %rcx
	movq	%rcx, (%rdi)
	movq	-8(%rsi,%rdx), %rcx
	movq	%rcx, -8(%rdi,%rdx)
	ret

L(between_5_8):
	/* Reached for 4 <= n <= 7 despite the label name: copy the first
	   and the last 4 bytes, which may overlap.  */
	movl	(%rsi), %ecx
	movl	%ecx, (%rdi)
	movl	-4(%rsi,%rdx), %ecx
	movl	%ecx, -4(%rdi,%rdx)
	ret
END(__memcpy_sse2_unaligned)
#endif
|