mpn/alpha/com_n.asm


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165

dnl  Alpha mpn_com_n -- mpn one's complement.

dnl  Copyright 2003 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C      cycles/limb
C EV4:    4.75
C EV5:    2.0
C EV6:    1.5


C void mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
C 2.0 c/l.  In general, a pattern like this unrolled to N limbs per loop
C will be 1.5+2/N c/l.
C
C 2 cycles of loop control are unavoidable, for pointer updates and the
C taken branch bubble, but also since ldq cannot issue two cycles after stq
C (and with a run of stqs that means neither of two cycles at the end of the
C loop).
C
C The fbeq is forced into the second cycle of the loop using unops, since
C the first time through it must wait for the cvtqt result.  Once that
C result is ready (a 1 cycle stall) then both the branch and following loads
C can issue together.
C
C The main loop handles an odd count of limbs, being two limbs loaded before
C each size test, plus one pipelined around from the previous iteration (or
C setup in the entry sequence).
C
C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
C entry sequence, and an increment of the pointers.  For an odd size there's
C no increment and the first store in the loop (r24) is a repeat of dst[0].
C
C Note that the load for r24 after the possible pointer increment is done
C before the explicit store to dst[0], in case src==dst.


ASM_START()

C Floating point constant 2.0.  It is loaded into f1 in the entry sequence
C and subtracted from the loop counter in f0 once per pass of the main loop.
FLOAT64(L(dat), 2.0)

	ALIGN(16)

C mpn_com_n -- store the bitwise complement of each of the size limbs at
C src into the corresponding limb at dst.
C
C Inputs:   r16 dst, r17 src, r18 size
C Result:   none, no instruction in this routine writes r0.
C Touches:  r5-r8, r17, r19-r24, f0, f1, and 16 bytes of stack which are
C           allocated and released again within the entry sequence.
C
C src[0] is loaded and dst[0] is stored unconditionally, so size must be at
C least 1.
C
C The loop counter (size-3)/2 lives in a floating point register.  It is
C moved there through the temporary stack slot (stq then ldt) and converted
C with cvtqt.  NOTE(review): presumably this keeps the loop control off the
C integer issue slots -- confirm against the timing notes at the top of the
C file.
C
C NOTE(review): the gp argument to PROLOGUE presumably requests the global
C pointer setup that LEA needs to address L(dat) -- confirm against the
C macro definitions.
PROLOGUE(mpn_com_n,gp)

	C r16	dst
	C r17	src
	C r18	size

	lda	r30, -16(r30)		C temporary stack space
	lda	r7, -3(r18)		C size - 3

	C For size<=2 the value in r7 is negative, so the logical shift below
	C gives a meaningless count.  That is harmless, since the blt on r7
	C further down leaves before f0 is ever tested.
	ldq	r20, 0(r17)		C src[0]
	srl	r7, 1, r6		C (size-3)/2

	stq	r6, 8(r30)		C (size-3)/2
	and	r7, 1, r5		C 1 if size even

	LEA(	r8, L(dat))
	s8addq	r5, r17, r17		C skip src[0] if even

	C r31 reads as zero, so ornot r31,x,y gives y = ~x.
	ornot	r31, r20, r20		C ~src[0]
	unop

	C The r24 load must come before the dst[0] store below, so that the
	C limb is read before it can be overwritten when src==dst.
	ldt	f0, 8(r30)		C (size-3)/2
	ldq	r24, 0(r17)		C src[0 or 1]

	stq	r20, 0(r16)		C dst[0]
	s8addq	r5, r16, r19		C skip dst[0] if even

	ldt	f1, 0(r8)		C data 2.0
	lda	r30, 16(r30)		C restore stack
	unop
	cvtqt	f0, f0			C (size-3)/2 as float

	C size 1: r24 = ~src[0] and r19 = dst, done_1 stores dst[0] again.
	C size 2: r24 = ~src[1] and r19 = &dst[1], done_1 stores dst[1].
	C The two unops after the branch are padding ahead of L(top).
	ornot	r31, r24, r24
	blt	r7, L(done_1)		C if size<=2
	unop
	unop


	C 16-byte alignment here
L(top):
	C r17	src, incrementing
	C r19	dst, incrementing
	C r24	dst[i] result, ready to store
	C f0	(size-3)/2, decrementing
	C f1	2.0
	C
	C Each full pass stores dst[i] to dst[i+3] and loads src[i+1] to
	C src[i+4].  The complement of src[i+4] is left in r24 for the next
	C pass or for done_1.

	ldq	r20, 8(r17)		C src[i+1]
	ldq	r21, 16(r17)		C src[i+2]
	unop
	unop

	C Count of zero means exactly three limbs remain: the pending r24
	C plus the two just loaded into r20 and r21.  done_2 finishes them.
	fbeq	f0, L(done_2)
	unop
	ldq	r22, 24(r17)		C src[i+3]
	ldq	r23, 32(r17)		C src[i+4]

	stq	r24, 0(r19)		C dst[i]
	ornot	r31, r20, r20
	subt	f0, f1, f0		C count -= 2
	unop

	stq	r20, 8(r19)		C dst[i+1]
	ornot	r31, r21, r21
	unop
	unop

	stq	r21, 16(r19)		C dst[i+2]
	ornot	r31, r22, r22

	stq	r22, 24(r19)		C dst[i+3]
	ornot	r31, r23, r24		C ~src[i+4], carried around in r24

	C Loop again while the count is still >= 0.  A negative count means
	C only the limb pending in r24 remains, so fall through to done_1.
	lda	r17, 32(r17)		C src += 4
	lda	r19, 32(r19)		C dst += 4
	unop
	fbge	f0, L(top)


	C Exit with one limb outstanding, already complemented in r24.
L(done_1):
	C r19	&dst[size-1]
	C r24	result for dst[size-1]

	stq	r24, 0(r19)		C dst[size-1]
	ret	r31, (r26), 1


	C Exit with three limbs outstanding.  r24 is already complemented,
	C r20 and r21 still hold the raw source limbs and are complemented
	C here.
L(done_2):
	C r19	&dst[size-3]
	C r20	src[size-2]
	C r21	src[size-1]
	C r24	result for dst[size-3]

	stq	r24, 0(r19)		C dst[size-3]
	ornot	r31, r20, r20

	stq	r20, 8(r19)		C dst[size-2]
	ornot	r31, r21, r21

	stq	r21, 16(r19)		C dst[size-1]
	ret	r31, (r26), 1

EPILOGUE()
ASM_END()