1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
|
/* Copyright (C) 1996 Free Software Foundation, Inc.
Contributed by Richard Henderson (rth@tamu.edu)
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If
not, write to the Free Software Foundation, Inc., 675 Mass Ave,
Cambridge, MA 02139, USA. */
/* This is the child of the C-with-inline-assembly memcpy posted by
Martin Ostermann (ost@comnets.rwth-aachen.de).
This is generally scheduled for the EV5, but whenever necessary and
possible, the autoswap slotting feature of the EV5 is used so that the
code lays out nicely for the EV4 as well. */
#include <alpha/regdef.h>
.set noreorder
.text
.ent copy_fwd_aligned
/* copy_fwd_aligned: forward copy used when source and destination have
   the same offset within a quadword.  It is entered by a branch to
   $fwd_aligned below rather than at the .ent label, uses no stack
   frame, and finishes with ret.

   Registers read on entry at $fwd_aligned:
     a0 == destination pointer
     a1 == source pointer
     a2 == byte count
   Registers written: t0-t9, a0, a1, a2.

   The trailing "e0" / "e1" notes on each instruction are the issue
   slot annotations of the original author.  */
copy_fwd_aligned:
.frame sp, 0, ra, 0
.prologue 0
/* Aligned forward copy main loop. On entry to this basic block:
t0 == source word waiting to be stored
t2 == loop counter
a0 == destination pointer
a1 == source pointer
a2 mod 8 == byte count in final word */
.align 4
$fa_loop:
and t2, 7, t1 # e0 : t1 = word count mod 8
beq t1, 1f # .. e1 : none left over, go to the eight-word loop
/* Store the pending word and fetch the next one, t1 times.  */
0: stq_u t0, 0(a0) # e0 :
subq t1, 1, t1 # .. e1 :
ldq_u t0, 8(a1) # e0 : copy up to seven words
addq a0, 8, a0 # .. e1 :
addq a1, 8, a1 # e0 :
bne t1, 0b # .. e1 :
1: bic t2, 7, t2 # e0 : t2 = word count rounded down to a multiple of 8
beq t2, $fa_tail # .. e1 :
/* Eight words per iteration.  a0 is advanced early, which is why the
   stores below use negative offsets.  t0 is reloaded from 64(a1) and
   becomes the pending word for the next iteration or for the tail.  */
2: stq_u t0, 0(a0) # e0 :
addq a0, 64, a0 # .. e1 :
ldq_u t3, 8(a1) # e0 : copy eight words as fast as we can
ldq_u t4, 16(a1) # .. e1 :
ldq_u t5, 24(a1) # e0 :
ldq_u t6, 32(a1) # .. e1 :
ldq_u t7, 40(a1) # e0 :
ldq_u t8, 48(a1) # .. e1 :
ldq_u t9, 56(a1) # e0 :
ldq_u t0, 64(a1) # .. e1 :
stq_u t3, -56(a0) # e0 :
subq t2, 8, t2 # .. e1 :
stq_u t4, -48(a0) # e0 :
addq a1, 64, a1 # .. e1 :
stq_u t5, -40(a0) # e0 :
stq_u t6, -32(a0) # e0 :
stq_u t7, -24(a0) # e0 :
stq_u t8, -16(a0) # e0 :
stq_u t9, -8(a0) # e0 :
bne t2, 2b # .. e1 :
/* Take care of a partial word tail. */
$fa_tail:
and a2, 7, t3 # e0 : t3 = byte count in the final word
bne t3, 1f # .. e1 (zdb)
/* Aligned copy, aligned tail, final store. */
stq_u t0, 0(a0)
ret
/* Partial final word: merge the low (a2 mod 8) bytes of the pending
   source word with the remaining bytes already in the destination.  */
1: ldq_u t1, 0(a0) # e1 : current destination word
mskql t0, a2, t0 # .. e1 : keep source bytes below the end offset
mskqh t1, a2, t1 # e0 (stall)
bis t0, t1, t0 # e1 :
stq_u t0, 0(a0) # e0 :
ret # .. e1 :
/* This is the actual entry point to this function. */
.align 3
$fwd_aligned:
ldq_u t0, 0(a1) # e0 : first source word
and a0, 7, t3 # .. e1 : t3 = destination offset within its word
addq a2, t3, a2 # e0 : a2 = count measured from the word start
subq a2, 1, t2 # e1 :
sra t2, 3, t2 # e0 : t2 = words to store before the final one
beq t3, $fa_loop # .. e1 : destination starts on a word boundary
/* The destination starts inside a word: the destination bytes below
   the start offset have to be kept.  */
ldq_u t1, 0(a0) # e0 : current first destination word
beq t2, $fa_small # .. e1 : head and tail share one word
mskqh t0, a0, t0 # e0 : source bytes from the start offset up
mskql t1, a0, t3 # e0 : destination bytes below the start offset
bis t0, t3, t0 # e0 :
br $fa_loop # .. e1 :
/* The move affects exactly one destination word. */
$fa_small:
mskqh t0, a0, t0 # e0 : source bytes from the start offset up
and a2, 7, t4 # .. e1 : t4 = end offset within the word
mskql t1, a0, t3 # e0 : destination bytes below the start offset
bne t4, 1f # .. e1 : copy stops before the end of the word
or t0, t3, t0 # e0 :
unop # :
stq_u t0, 0(a0) # e0 :
ret # .. e1 :
/* The copy stops inside the word: also keep the destination bytes at
   and above the end offset.  */
1: mskql t0, a2, t0 # e0 : drop source bytes at and above the end offset
mskqh t1, a2, t1 # e0 : destination bytes at and above the end offset
or t0, t3, t0 # e0 :
or t0, t1, t0 # e1 :
stq_u t0, 0(a0) # e0 :
ret # .. e1 :
.end copy_fwd_aligned
.ent memcpy
.globl memcpy
.align 3
/* memcpy: public entry point.
   In:   a0 == destination pointer
         a1 == source pointer
         a2 == byte count
   Out:  v0 == the destination pointer as passed in
   No stack frame is used.  A zero count returns at once through
   $zero_length.  Otherwise control branches to $fwd_aligned when a0 and
   a1 agree in their low three bits and to $fwd_unaligned when they do
   not; those paths issue the final ret.  */
memcpy:
.frame sp, 0, ra, 0
#ifdef PROF
/* On entry pv holds the address of this procedure, so gp has to be
   computed relative to pv.  ra holds the return address in the caller
   and would produce a wrong gp here.
   NOTE(review): AT is used explicitly below; confirm whether the
   assembler wants ".set noat" around this sequence.  */
ldgp gp, 0(pv)
lda AT, _mcount
jsr AT, (AT), _mcount
.prologue 1
#else
.prologue 0
#endif
mov a0, v0 # return value is the original destination
beq a2, $zero_length # nothing to copy
/* Are source and destination co-aligned? */
xor a0, a1, t0 # address bits that differ
unop
and t0, 7, t0 # only the offset within a quadword matters
beq t0, $fwd_aligned
br $fwd_unaligned
.end memcpy
.ent copy_fwd_unaligned
/* copy_fwd_unaligned: forward copy used when source and destination sit
   at different offsets within a quadword.  It is entered by a branch to
   $fwd_unaligned below rather than at the .ent label, uses no stack
   frame, and finishes with ret.  $zero_length, the final ret, is also
   the target memcpy branches to for a zero byte count.

   Registers read on entry at $fwd_unaligned:
     a0 == destination pointer
     a1 == source pointer
     a2 == byte count
   Registers written: t0-t8, a0, a1, a2.

   Each destination quadword is assembled from two neighbouring source
   quadwords with extql / extqh.  a1 is first moved back by the
   destination misalignment so that its low three bits give the shift
   for both extracts.  The final source quadword is fetched through
   t7 - 1, the address of the last source byte.

   The trailing "e0" / "e1" notes on each instruction are the issue
   slot annotations of the original author.  */
copy_fwd_unaligned:
.frame sp, 0, ra, 0
.prologue 0
/* Unaligned forward copy main loop. On entry to this basic block:
t0 == source low word, unshifted
t2 == loop counter
t7 == last source byte + 1
a0 == destination pointer
a1 == source pointer
a2 mod 8 == byte count in final word */
.align 4
$fu_loop:
beq t2, $fu_tail # e1 : no whole words left
blbc t2, 0f # e1 : even count, go straight to the two-word loop
ldq_u t1, 8(a1) # e1 : copy one unaligned word
extql t0, a1, t3 # .. e0 : low part from the current source word
addq a1, 8, a1 # e0 : the low three bits of a1 do not change
addq a0, 8, a0 # .. e1 :
extqh t1, a1, t4 # e0 : high part from the next source word
subq t2, 1, t2 # .. e1 :
mov t1, t0 # e0 : next source word becomes the low word
or t3, t4, t3 # .. e1 :
stq_u t3, -8(a0) # e0 :
beq t2, $fu_tail # .. e1 :
0: ldq_u t1, 8(a1) # e1 : copy two unaligned words
extql t0, a1, t3 # .. e0 :
ldq_u t0, 16(a1) # e0 : low word for the next iteration or the tail
subq t2, 2, t2 # .. e1 :
extqh t1, a1, t4 # e0 :
addq a0, 16, a0 # .. e1 :
extql t1, a1, t5 # e0 :
or t3, t4, t3 # .. e1 : first destination word
extqh t0, a1, t6 # e0 :
addq a1, 16, a1 # .. e1 :
stq_u t3, -16(a0) # e0 :
or t5, t6, t5 # .. e1 : second destination word
stq_u t5, -8(a0) # e0 :
bne t2, 0b # .. e1 :
/* Take care of a partial word tail. */
$fu_tail:
ldq_u t4, -1(t7) # e1 : source word holding the last byte
extql t0, a1, t3 # .. e0 :
extqh t4, a1, t4 # e0 (stall)
and a2, 7, t5 # .. e1 : t5 = byte count in the final word
or t3, t4, t3 # e0 :
beq t5, 1f # .. e1 : final word is complete, store it as is
ldq_u t1, 0(a0) # e1 : current destination word
mskql t3, a2, t3 # .. e0 : keep source bytes below the end offset
mskqh t1, a2, t1 # e0 (stall)
or t1, t3, t3 # e1 :
1: stq_u t3, 0(a0) # e0 :
ret # .. e1 :
/* The entry point to the unaligned forward copy. */
.align 3
$fwd_unaligned:
ldq_u t0, 0(a1) # e0 : load initial bits of src
addq a1, a2, t7 # .. e1 : record last byte + 1 of src
and a0, 7, t3 # e0 : find dst misalignment
addq a2, t3, a2 # e1 : find number of words affected
subq a2, 1, t2 # e0 :
cmple a2, 8, t4 # .. e1 : are we dealing with a small block?
subq a1, t3, a1 # e0 : line a1 up with the start of the dst word
bne t4, $fu_small # .. e1 :
srl t2, 3, t2 # e0 : t2 = words to store before the final one
beq t3, $fu_loop # .. e1 : dst starts on a word boundary
/* Take care of an unaligned dst head. */
ldq_u t5, 0(a0) # e0 : current first destination word
ldq_u t1, 8(a1) # .. e1 :
extql t0, a1, t3 # e0 :
addq a0, 8, a0 # .. e1 : the low three bits of a0 do not change
extqh t1, a1, t4 # e0 :
addq a1, 8, a1 # .. e1 :
mskql t5, a0, t5 # e0 : destination bytes below the start offset
or t3, t4, t3 # .. e1 :
mskqh t3, a0, t3 # e0 : source bytes from the start offset up
subq t2, 1, t2 # .. e1 : the head word is accounted for
or t3, t5, t3 # e0 :
mov t1, t0 # .. e1 :
stq_u t3, -8(a0) # e0 :
br $fu_loop # .. e1 :
/* The move affects exactly one destination word. */
.align 3
$fu_small:
ldq_u t2, 0(a0) # e1 : current destination word
extql t0, a1, t3 # .. e0 :
ldq_u t1, -1(t7) # e0 : source word holding the last byte
and a2, 7, t8 # .. e1 : t8 = end offset within the word
mskqh t2, a2, t6 # e0 : destination bytes at and above the end offset
mskql t2, a0, t5 # e0 : destination bytes below the start offset
extqh t1, a1, t4 # e0 :
cmovne t8, t6, t8 # .. e1 : trailing bytes to keep, or 0 if none
or t3, t4, t3 # e0 : assembled source bytes
or t5, t8, t5 # .. e1 : all destination bytes to keep
mskqh t3, a0, t3 # e0 : drop source bytes below the start offset
and a2, 7, t8 # .. e1 : end offset again, t8 was overwritten
mskql t3, a2, t6 # e0 : source bytes below the end offset
/* The end mask has to land in t3, the register that is merged and
   stored below.  Without it the bytes of t3 at and above the end
   offset, which lie outside the requested range, would be ORed onto
   the destination bytes kept in t5.  A zero end offset means the copy
   runs to the end of the word and t3 is used unmasked.  */
cmovne t8, t6, t3 # e1 :
or t3, t5, t3 # e0 :
unop # :
stq_u t3, 0(a0) # e0 :
$zero_length:
ret # .. e1 :
.end copy_fwd_unaligned
|