diff options
author | Niels Möller <nisse@lysator.liu.se> | 2015-12-15 23:21:06 +0100 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2015-12-15 23:21:06 +0100 |
commit | fa269b6ad06dd13c901dbd84a12e52b918a09cd7 (patch) | |
tree | 5c60756664b91288f070da1adcb3425c19d0c55f | |
parent | c71d2c9d20eeebb985e3872e4550137209e3ce4d (diff) | |
download | nettle-fa269b6ad06dd13c901dbd84a12e52b918a09cd7.tar.gz |
Fix carry folding bug in x86_64 ecc_384_modp. Problem reported by Hanno Böck.
-rw-r--r-- | ChangeLog | 8 | ||||
-rw-r--r-- | x86_64/ecc-384-modp.asm | 171 |
2 files changed, 84 insertions, 95 deletions
@@ -1,3 +1,11 @@ +2015-12-15 Niels Möller <nisse@lysator.liu.se> + + * x86_64/ecc-384-modp.asm: Fixed carry propagation bug. Problem + reported by Hanno Böck. Simplified the folding to always use + non-negative carry, the old code attempted to add in a carry which + could be either positive or negative, but didn't get that case + right. + 2015-12-10 Niels Möller <nisse@lysator.liu.se> * ecc-256.c (ecc_256_modp): Fixed carry propagation bug. Problem diff --git a/x86_64/ecc-384-modp.asm b/x86_64/ecc-384-modp.asm index de9a5151..8e55393f 100644 --- a/x86_64/ecc-384-modp.asm +++ b/x86_64/ecc-384-modp.asm @@ -1,7 +1,7 @@ C x86_64/ecc-384-modp.asm ifelse(< - Copyright (C) 2013 Niels Möller + Copyright (C) 2013, 2015 Niels Möller This file is part of GNU Nettle. @@ -33,7 +33,7 @@ ifelse(< .file "ecc-384-modp.asm" define(<RP>, <%rsi>) -define(<D4>, <%rax>) +define(<D5>, <%rax>) define(<T0>, <%rbx>) define(<T1>, <%rcx>) define(<T2>, <%rdx>) @@ -48,8 +48,8 @@ define(<H4>, <%r13>) define(<H5>, <%r14>) define(<C2>, <%r15>) define(<C0>, H5) C Overlap -define(<D0>, RP) C Overlap -define(<TMP>, H4) C Overlap +define(<TMP>, RP) C Overlap + PROLOGUE(nettle_ecc_384_modp) W64_ENTRY(2, 0) @@ -61,34 +61,38 @@ PROLOGUE(nettle_ecc_384_modp) push %r14 push %r15 - C First get top 2 limbs, which need folding twice + C First get top 2 limbs, which need folding twice. + C B^10 = B^6 + B^4 + 2^32 (B-1)B^4. + C We handle the terms as follow: C - C H5 H4 - C -H5 - C ------ - C H0 D4 + C B^6: Folded immediatly. C - C Then shift right, (H1,H0,D4) <-- (H0,D4) << 32 - C and add + C B^4: Delayed, added in in the next folding. C - C H5 H4 - C H1 H0 - C ---------- - C C2 H1 H0 - - mov 80(RP), D4 - mov 88(RP), H0 - mov D4, H4 - mov H0, H5 - sub H0, D4 - sbb $0, H0 - - mov D4, T2 - mov H0, H1 - shl $32, H0 - shr $32, T2 + C 2^32(B-1) B^4: Low half limb delayed until the next + C folding. Top 1.5 limbs subtracted and shifter now, resulting + C in 2.5 limbs. The low limb saved in D5, high 1.5 limbs added + C in. + + mov 80(RP), H4 + mov 88(RP), H5 + C Shift right 32 bits, into H1, H0 + mov H4, H0 + mov H5, H1 + mov H5, D5 shr $32, H1 - or T2, H0 + shl $32, D5 + shr $32, H0 + or D5, H0 + + C H1 H0 + C - H1 H0 + C -------- + C H1 H0 D5 + mov H0, D5 + neg D5 + sbb H1, H0 + sbb $0, H1 xor C2, C2 add H4, H0 @@ -127,118 +131,95 @@ PROLOGUE(nettle_ecc_384_modp) adc H3, T5 adc $0, C0 - C H3 H2 H1 H0 0 - C - H4 H3 H2 H1 H0 - C --------------- - C H3 H2 H1 H0 D0 - - mov XREG(D4), XREG(D4) - mov H0, D0 - neg D0 - sbb H1, H0 - sbb H2, H1 - sbb H3, H2 - sbb H4, H3 - sbb $0, D4 - - C Shift right. High bits are sign, to be added to C0. - mov D4, TMP - sar $32, TMP - shl $32, D4 - add TMP, C0 - + C Shift left, including low half of H4 mov H3, TMP + shl $32, H4 shr $32, TMP - shl $32, H3 - or TMP, D4 + or TMP, H4 mov H2, TMP + shl $32, H3 shr $32, TMP - shl $32, H2 or TMP, H3 mov H1, TMP + shl $32, H2 shr $32, TMP - shl $32, H1 or TMP, H2 mov H0, TMP + shl $32, H1 shr $32, TMP - shl $32, H0 or TMP, H1 - mov D0, TMP - shr $32, TMP - shl $32, D0 - or TMP, H0 + shl $32, H0 + + C H4 H3 H2 H1 H0 0 + C - H4 H3 H2 H1 H0 + C --------------- + C H4 H3 H2 H1 H0 TMP - add D0, T0 + mov H0, TMP + neg TMP + sbb H1, H0 + sbb H2, H1 + sbb H3, H2 + sbb H4, H3 + sbb $0, H4 + + add TMP, T0 adc H0, T1 adc H1, T2 adc H2, T3 adc H3, T4 - adc D4, T5 + adc H4, T5 adc $0, C0 C Remains to add in C2 and C0 - C C0 C0<<32 (-2^32+1)C0 - C C2 C2<<32 (-2^32+1)C2 - C where C2 is always positive, while C0 may be -1. + C Set H1, H0 = (2^96 - 2^32 + 1) C0 mov C0, H0 mov C0, H1 - mov C0, H2 - sar $63, C0 C Get sign shl $32, H1 - sub H1, H0 C Gives borrow iff C0 > 0 + sub H1, H0 sbb $0, H1 - add C0, H2 + C Set H3, H2 = (2^96 - 2^32 + 1) C2 + mov C2, H2 + mov C2, H3 + shl $32, H3 + sub H3, H2 + sbb $0, H3 + add C0, H2 C No carry. Could use lea trick + + xor C0, C0 add H0, T0 adc H1, T1 - adc $0, H2 - adc $0, C0 - - C Set (H1 H0) <-- C2 << 96 - C2 << 32 + 1 - mov C2, H0 - mov C2, H1 - shl $32, H1 - sub H1, H0 - sbb $0, H1 - - add H2, H0 - adc C0, H1 - adc C2, C0 - mov C0, H2 - sar $63, C0 - add H0, T2 - adc H1, T3 - adc H2, T4 - adc C0, T5 - sbb C0, C0 + adc H2, T2 + adc H3, T3 + adc C2, T4 + adc D5, T5 C Value delayed from initial folding + adc $0, C0 C Use sbb and switch sign? C Final unlikely carry mov C0, H0 mov C0, H1 - mov C0, H2 - sar $63, C0 shl $32, H1 sub H1, H0 sbb $0, H1 - add C0, H2 pop RP - sub H0, T0 + add H0, T0 mov T0, (RP) - sbb H1, T1 + adc H1, T1 mov T1, 8(RP) - sbb H2, T2 + adc C0, T2 mov T2, 16(RP) - sbb C0, T3 + adc $0, T3 mov T3, 24(RP) - sbb C0, T4 + adc $0, T4 mov T4, 32(RP) - sbb C0, T5 + adc $0, T5 mov T5, 40(RP) pop %r15 |