summaryrefslogtreecommitdiff
path: root/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
blob: 335a498a23a9eb03a16f9d71df86426c99bbd8d4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
/* memcpy with unaligned loads
   Copyright (C) 2013-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

#include <sysdep.h>

#include "asm-syntax.h"


/* Copy %rdx bytes from (%rsi) to (%rdi) using unaligned SSE2 loads.

   Register use, as established by the code below:
     In:        %rdi = destination, %rsi = source, %rdx = byte count (n).
     Out:       %rax = the original %rdi.
     Clobbered: %rcx, %rdx, %rsi, %r8-%r11, %xmm8, flags.

   Strategy: every size up to 128 bytes is handled without a loop by
   copying inward from both ends of the buffer with loads/stores that
   are allowed to overlap in the middle.  Larger sizes copy the first
   and last 64 bytes the same way and then fill the 64-byte-aligned
   destination blocks in between with a loop.  */
ENTRY(__memcpy_sse2_unaligned)
	/* The return value is the destination pointer; %rax is not written
	   again anywhere in this function.  */
	movq	%rdi, %rax
	/* n == 0: nothing to copy.  */
	testq	%rdx, %rdx
	je	L(return)
	/* 1 <= n <= 16 is handled with general-purpose registers.  */
	cmpq	$16, %rdx
	jbe	L(less_16)
	/* n > 16: copy the first 16 and the last 16 bytes.  For n <= 32
	   these two (possibly overlapping) stores cover the whole buffer.
	   The flags from the cmpq are consumed by the ja below; the movdqu
	   instructions in between do not modify flags.  */
	movdqu	(%rsi), %xmm8
	cmpq	$32, %rdx
	movdqu	%xmm8, (%rdi)
	movdqu	-16(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -16(%rdi,%rdx)
	ja	.L31
L(return):
	ret
	/* Align the next label to a 16-byte boundary.  */
	.p2align 4,,10
	.p2align 4
.L31:
	/* n > 32: additionally copy bytes [16, 32) and [n-32, n-16), so the
	   first 32 and last 32 bytes are done.  That is enough for n <= 64.  */
	movdqu	16(%rsi), %xmm8
	cmpq	$64, %rdx
	movdqu	%xmm8, 16(%rdi)
	movdqu	-32(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -32(%rdi,%rdx)
	jbe	L(return)
	/* n > 64: additionally copy bytes [32, 64) and [n-64, n-32), so the
	   first 64 and last 64 bytes are done.  That is enough for n <= 128.
	   The cmpq result is carried across the moves to the jbe below.  */
	movdqu	32(%rsi), %xmm8
	cmpq	$128, %rdx
	movdqu	%xmm8, 32(%rdi)
	movdqu	-48(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -48(%rdi,%rdx)
	movdqu	48(%rsi), %xmm8
	movdqu	%xmm8, 48(%rdi)
	movdqu	-64(%rsi,%rdx), %xmm8
	movdqu	%xmm8, -64(%rdi,%rdx)
	jbe	L(return)
	/* n > 128: set up the block loop.
	     %rcx = (dst + 64) rounded down to a multiple of 64: the first
		    64-byte-aligned destination address above dst.
	     %rdx = (dst + n) rounded down to a multiple of 64: the loop end.
	   Destination bytes below %rcx lie within the first 64 bytes and
	   bytes from %rdx upward lie within the last 64 bytes, both of which
	   were stored above, so the loop only has to fill [%rcx, %rdx).  */
	leaq	64(%rdi), %rcx
	addq	%rdi, %rdx
	andq	$-64, %rdx
	andq	$-64, %rcx
	/* Advance the source pointer by the same amount the destination was
	   advanced (%r11 = %rcx - dst).  */
	movq	%rcx, %r11
	subq	%rdi, %r11
	addq	%r11, %rsi
	/* Skip the loop if there is no aligned block to copy.
	   NOTE(review): since n > 128 here, %rdx appears to always be at
	   least %rcx + 64, so this branch looks never taken -- confirm.  */
	cmpq	%rdx, %rcx
	je	L(return)
	/* %r10 = source address minus destination address, so that
	   (%rcx,%r10) is the source byte matching destination (%rcx).
	   %r9, %r8 and %r11 are the same delta plus 16, 32 and 48, which lets
	   the loop advance only %rcx.  */
	movq	%rsi, %r10
	subq	%rcx, %r10
	leaq	16(%r10), %r9
	leaq	32(%r10), %r8
	leaq	48(%r10), %r11
	/* Align the loop head to a 16-byte boundary.  */
	.p2align 4,,10
	.p2align 4
L(loop):
	/* Copy one 64-byte block per iteration.  Loads are unaligned
	   (movdqu) because the source alignment is unknown; stores use
	   movdqa because %rcx is a multiple of 64.  */
	movdqu	(%rcx,%r10), %xmm8
	movdqa	%xmm8, (%rcx)
	movdqu	(%rcx,%r9), %xmm8
	movdqa	%xmm8, 16(%rcx)
	movdqu	(%rcx,%r8), %xmm8
	movdqa	%xmm8, 32(%rcx)
	movdqu	(%rcx,%r11), %xmm8
	movdqa	%xmm8, 48(%rcx)
	addq	$64, %rcx
	/* Both pointers are multiples of 64, so the loop ends on equality.  */
	cmpq	%rcx, %rdx
	jne	L(loop)
	ret
L(less_16):
	/* Reached with 1 <= n <= 16.  Bit 3 or bit 4 of n set means
	   8 <= n <= 16 (despite the label name, n == 8 also goes there).  */
	testb	$24, %dl
	jne	L(between_9_16)
	/* Otherwise bit 2 set means 4 <= n <= 7.  */
	testb	$4, %dl
	.p2align 4,,5
	jne	L(between_5_8)
	/* Only n = 1, 2 or 3 can remain here.
	   NOTE(review): n == 0 was already filtered at function entry, so
	   this zero check looks unreachable -- confirm before removing.  */
	testq	%rdx, %rdx
	.p2align 4,,2
	je	L(return)
	/* Copy the first byte; that completes n == 1.  The testb result is
	   carried across the movb to the je below.  */
	movzbl	(%rsi), %ecx
	testb	$2, %dl
	movb	%cl, (%rdi)
	je	L(return)
	/* n is 2 or 3: copy the last two bytes (overlapping the first byte
	   when n == 2).  */
	movzwl	-2(%rsi,%rdx), %ecx
	movw	%cx, -2(%rdi,%rdx)
	ret
L(between_9_16):
	/* 8 <= n <= 16: copy the first 8 and the last 8 bytes; the two
	   stores overlap when n < 16.  */
	movq	(%rsi), %rcx
	movq	%rcx, (%rdi)
	movq	-8(%rsi,%rdx), %rcx
	movq	%rcx, -8(%rdi,%rdx)
	ret
L(between_5_8):
	/* 4 <= n <= 7: copy the first 4 and the last 4 bytes; the two
	   stores overlap.  */
	movl	(%rsi), %ecx
	movl	%ecx, (%rdi)
	movl	-4(%rsi,%rdx), %ecx
	movl	%ecx, -4(%rdi,%rdx)
	ret
END(__memcpy_sse2_unaligned)

#endif