summaryrefslogtreecommitdiff
path: root/mpn
diff options
context:
space:
mode:
authorKevin Ryde <user42@zip.com.au>2002-04-20 01:39:32 +0200
committerKevin Ryde <user42@zip.com.au>2002-04-20 01:39:32 +0200
commit6a4a0248035755bf2db5a1e51d454b5e66e76165 (patch)
tree02c6ffd53d93675c79a97ffac55846a39cabab02 /mpn
parente6039823ff3d83dc2d8e01329bfe602bdb17964d (diff)
downloadgmp-6a4a0248035755bf2db5a1e51d454b5e66e76165.tar.gz
* mpn/x86/k7/mmx/divrem_1.asm, mpn/x86/p6/mmx/divrem_1.asm
(mpn_preinv_divrem_1): New entrypoint. (mpn_divrem_1): Avoid a branch when testing high<divisor.
Diffstat (limited to 'mpn')
-rw-r--r--mpn/x86/k7/mmx/divrem_1.asm154
1 files changed, 122 insertions, 32 deletions
diff --git a/mpn/x86/k7/mmx/divrem_1.asm b/mpn/x86/k7/mmx/divrem_1.asm
index 161ae4643..685195d34 100644
--- a/mpn/x86/k7/mmx/divrem_1.asm
+++ b/mpn/x86/k7/mmx/divrem_1.asm
@@ -1,19 +1,20 @@
-dnl AMD K7 mpn_divrem_1 -- mpn by limb division.
+dnl AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb
+dnl division.
dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
-dnl
+dnl
dnl This file is part of the GNU MP Library.
-dnl
+dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
-dnl
+dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
-dnl
+dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
@@ -31,6 +32,10 @@ C mp_limb_t divisor);
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
C mp_srcptr src, mp_size_t size,
C mp_limb_t divisor, mp_limb_t carry);
+C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t inverse,
+C unsigned shift);
C
C The method and nomenclature follow part 8 of "Division by Invariant
C Integers using Multiplication" by Granlund and Montgomery, reference in
@@ -41,12 +46,25 @@ C for m', and "d" for d_norm, which won't cause any confusion since it's
C only the normalized divisor that's of any use in the code. "b" is written
C for 2^N, the size of a limb, N being 32 here.
C
-C mpn_divrem_1 avoids one division if the src high limb is less than the
-C divisor. mpn_divrem_1c doesn't check for a zero carry, since in normal
-C circumstances that will be a very rare event.
+C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
+C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero
+C carry, since in normal circumstances that will be a very rare event.
+C
+C The test for skipping a division is branch free (once size>=1 is tested).
+C The store to the destination high limb is 0 when a divide is skipped, or
+C if it's not skipped then a copy of the src high limb is used. The latter
+C is in case src==dst.
C
C There's a small bias towards expecting xsize==0, by having code for
C xsize==0 in a straight line and xsize!=0 under forward jumps.
+C
+C Alternatives:
+C
+C If the divisor is normalized (high bit set) then a division step can
+C always be skipped, since the high destination limb is always 0 or 1 in
+C that case. It doesn't seem worth checking for this though, since it
+C probably occurs infrequently, in particular note that big_base for a
+C decimal mpn_get_str is not normalized in a 32-bit limb.
dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
@@ -61,7 +79,9 @@ dnl even more so on the fractional part.
deflit(MUL_THRESHOLD, 3)
-defframe(PARAM_CARRY, 24)
+defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1
+defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1
+defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE, 16)
defframe(PARAM_SRC, 12)
@@ -84,6 +104,61 @@ deflit(STACK_SPACE, 36)
TEXT
ALIGN(32)
+PROLOGUE(mpn_preinv_divrem_1)
+deflit(`FRAME',0)
+ movl PARAM_XSIZE, %ecx
+ movl PARAM_DST, %edx
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_SIZE, %ebx
+
+ leal 8(%edx,%ecx,4), %edx C &dst[xsize+2]
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %edx, VAR_DST_STOP C &dst[xsize+2]
+ movl %edi, SAVE_EDI
+ xorl %edi, %edi C carry
+
+ movl -4(%esi,%ebx,4), %eax C src high limb
+ xor %ecx, %ecx
+
+ C
+
+ C
+
+ cmpl %ebp, %eax C high cmp divisor
+
+ cmovc( %eax, %edi) C high is carry if high<divisor
+ cmovnc( %eax, %ecx) C 0 if skip div, src high if not
+ C (the latter in case src==dst)
+
+ movl %ecx, -12(%edx,%ebx,4) C dst high limb
+ sbbl $0, %ebx C skip one division if high<divisor
+ movl PARAM_PREINV_SHIFT, %ecx
+
+ leal -8(%edx,%ebx,4), %edx C &dst[xsize+size]
+ movl $32, %eax
+
+ movl %edx, VAR_DST C &dst[xsize+size]
+
+ shll %cl, %ebp C d normalized
+ subl %ecx, %eax
+ movl %ecx, VAR_NORM
+
+ movd %eax, %mm7 C rshift
+ movl PARAM_PREINV_INVERSE, %eax
+ jmp L(start_preinv)
+
+EPILOGUE()
+
+
+ ALIGN(16)
+
PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)
movl PARAM_CARRY, %edx
@@ -103,13 +178,13 @@ deflit(`FRAME',STACK_SPACE)
movl %esi, SAVE_ESI
movl PARAM_SRC, %esi
- leal -4(%edi,%ebx,4), %edi
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
jmp L(start_1c)
EPILOGUE()
- C offset 0x31, close enough to aligned
+ C offset 0xa1, close enough to aligned
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)
@@ -118,34 +193,37 @@ deflit(`FRAME',0)
subl $STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)
- movl %ebp, SAVE_EBP
- movl PARAM_DIVISOR, %ebp
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
movl %ebx, SAVE_EBX
movl PARAM_XSIZE, %ebx
- movl %esi, SAVE_ESI
- movl PARAM_SRC, %esi
- orl %ecx, %ecx
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+ orl %ecx, %ecx C size
movl %edi, SAVE_EDI
movl PARAM_DST, %edi
leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
- jz L(no_skip_div)
+ jz L(no_skip_div) C if size==0
movl -4(%esi,%ecx,4), %eax C src high limb
+ xorl %esi, %esi
+
+ cmpl %ebp, %eax C high cmp divisor
- cmpl %ebp, %eax C one less div if high<divisor
- jnb L(no_skip_div)
+ cmovc( %eax, %edx) C high is carry if high<divisor
+ cmovnc( %eax, %esi) C 0 if skip div, src high if not
- movl $0, (%edi,%ecx,4) C dst high limb
- decl %ecx C size-1
- movl %eax, %edx C src high limb as initial carry
+ movl %esi, (%edi,%ecx,4) C dst high limb
+ sbbl $0, %ecx C size-1 if high<divisor
+ movl PARAM_SRC, %esi C reload
L(no_skip_div):
L(start_1c):
- C eax
+ C eax
C ebx xsize
C ecx size
C edx carry
@@ -238,7 +316,7 @@ L(mul_by_inverse):
bsrl %ebp, %eax C 31-l
- leal 12(%edi), %ebx
+ leal 12(%edi), %ebx C &dst[xsize+2], loop dst stop
leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
movl %edi, VAR_DST
@@ -265,6 +343,17 @@ L(mul_by_inverse):
divl %ebp C floor (b*(b-d)-1) / d
+L(start_preinv):
+ C eax inverse
+ C ebx size
+ C ecx shift
+ C edx
+ C esi src
+ C edi carry
+ C ebp divisor
+ C
+ C mm7 rshift
+
orl %ebx, %ebx C size
movl %eax, VAR_INVERSE
leal -12(%esi,%ebx,4), %eax C &src[size-3]
@@ -297,11 +386,14 @@ L(start_one):
L(start_zero):
+ C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
+ C skipped a division.
+
shll %cl, %edi C n2 = carry << l
- movl $0, %esi C n10 = 0
+ movl %edi, %eax C return value for zero_done
+ cmpl $0, PARAM_XSIZE
- C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then
- C must have xsize!=0
+ je L(zero_done)
jmp L(fraction_some)
@@ -437,7 +529,6 @@ L(integer_two_left):
C edi n2
C ebp divisor
C
- C mm0 src limb, shifted
C mm7 rshift
cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
@@ -498,7 +589,6 @@ L(integer_one_left):
C edi n2
C ebp divisor
C
- C mm0 src limb, shifted
C mm7 rshift
movl VAR_DST_STOP, %ecx
@@ -558,6 +648,7 @@ L(integer_none):
movl %edi, %eax
L(fraction_done):
movl VAR_NORM, %ecx
+L(zero_done):
movl SAVE_EBP, %ebp
movl SAVE_EDI, %edi
@@ -651,11 +742,10 @@ L(fraction_some):
C ebp divisor
movl PARAM_DST, %esi
- movl VAR_DST_STOP, %ecx
+ movl VAR_DST_STOP, %ecx C &dst[xsize+2]
movl %edi, %eax
- subl $8, %ecx
-
+ subl $8, %ecx C &dst[xsize]
jmp L(fraction_entry)