author    Niels Möller <nisse@lysator.liu.se>    2022-01-25 18:47:39 +0000
committer Niels Möller <nisse@lysator.liu.se>    2022-01-25 18:47:39 +0000
commit    6a9e40a0fbf8828218bb183a338ad227ca2c8c43 (patch)
tree      a39f69ba499ed33606c1199cbbdb5c98fdd0ad10
parent    1d4a985c0aa04cf84d34b060bc523e9c5f0a83ba (diff)
parent    39af7b2e22d215366f6dcde4d9e74254bc7919e6 (diff)
download  nettle-6a9e40a0fbf8828218bb183a338ad227ca2c8c43.tar.gz
Merge branch 'arm64-chacha' into 'master'
[Arm64] Optimize Chacha20

See merge request nettle/nettle!37
-rw-r--r--  arm64/chacha-2core.asm           231
-rw-r--r--  arm64/chacha-4core.asm           228
-rw-r--r--  arm64/chacha-core-internal.asm   126
3 files changed, 585 insertions, 0 deletions
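For orientation, the computation these files vectorize is the standard ChaCha core called through the commented prototypes below: rounds/2 double rounds (a column round followed by a diagonal round) over a 16-word state, with the original input added back at the end. A minimal C sketch, for illustration only (QR, ROTL32 and chacha_core_model are names invented here, not Nettle identifiers):

#include <stdint.h>
#include <string.h>

#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

/* One ChaCha quarter round; the assembly below realizes the four
   rotates with rev32 (16), ushr+sli (12 and 7) and tbl (8). */
#define QR(a, b, c, d) do {			\
    a += b; d ^= a; d = ROTL32(d, 16);		\
    c += d; b ^= c; b = ROTL32(b, 12);		\
    a += b; d ^= a; d = ROTL32(d, 8);		\
    c += d; b ^= c; b = ROTL32(b, 7);		\
  } while (0)

static void
chacha_core_model(uint32_t *dst, const uint32_t *src, unsigned rounds)
{
  uint32_t x[16];
  unsigned i;

  memcpy(x, src, sizeof(x));
  for (i = 0; i < rounds; i += 2)
    {
      /* Column round */
      QR(x[0], x[4], x[8],  x[12]);
      QR(x[1], x[5], x[9],  x[13]);
      QR(x[2], x[6], x[10], x[14]);
      QR(x[3], x[7], x[11], x[15]);
      /* Diagonal round */
      QR(x[0], x[5], x[10], x[15]);
      QR(x[1], x[6], x[11], x[12]);
      QR(x[2], x[7], x[8],  x[13]);
      QR(x[3], x[4], x[9],  x[14]);
    }
  /* Add the original input back in, as the assembly does with S0-S3. */
  for (i = 0; i < 16; i++)
    dst[i] = x[i] + src[i];
}

The single-block file (chacha-core-internal.asm) implements exactly this with one NEON register per state row; the 2core and 4core files interleave two and four such blocks per call.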
diff --git a/arm64/chacha-2core.asm b/arm64/chacha-2core.asm
new file mode 100644
index 00000000..e68c5364
--- /dev/null
+++ b/arm64/chacha-2core.asm
@@ -0,0 +1,231 @@
+C arm64/chacha-2core.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+ Copyright (C) 2022 Mamone Tarsha
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `x0')
+define(`SRC', `x1')
+define(`ROUNDS', `x2')
+
+C Working state
+
+define(`ROT24', `v0')
+
+define(`T0', `v16')
+
+C State, even elements in X, odd elements in Y
+define(`X0', `v17')
+define(`X1', `v18')
+define(`X2', `v19')
+define(`X3', `v20')
+define(`Y0', `v21')
+define(`Y1', `v22')
+define(`Y2', `v23')
+define(`Y3', `v24')
+
+C Original input state
+define(`S0', `v25')
+define(`S1', `v26')
+define(`S2', `v27')
+define(`S3', `v28')
+define(`S3p1', `v29')
+
+define(`TMP0', `v30')
+define(`TMP1', `v31')
+
+ C _chacha_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+PROLOGUE(_nettle_chacha_2core)
+
+ eor X1.16b, X1.16b, X1.16b C {0,0,...,0}
+ mov w3, #1
+ mov X1.s[0], w3 C {1,0,...,0}
+
+ add x3, SRC, #48
+ ld1 {X3.4s}, [x3]
+
+ C Increment the 64-bit counter (words 12-13) for the second block
+ add Y3.4s, X3.4s, X1.4s C low word + 1
+ cmhi Y3.4s, X3.4s, Y3.4s C all-ones mask where the low word wrapped
+ ext Y3.16b, Y3.16b, Y3.16b, #12 C move the carry mask to the high word
+ sub Y3.4s, X1.4s, Y3.4s C {1, carry, 0, 0}
+
+.Lshared_entry:
+ adr x3, .Lrot24
+ ld1 {ROT24.4s},[x3]
+
+ add Y3.4s, Y3.4s, X3.4s C second block's words 12-15
+
+C Load state
+ ld1 {X0.4s,X1.4s,X2.4s}, [SRC]
+
+ mov S0.16b, X0.16b
+ mov S1.16b, X1.16b
+ mov S2.16b, X2.16b
+ mov S3.16b, X3.16b
+ mov S3p1.16b, Y3.16b
+
+ trn2 Y0.4s, X0.4s, X0.4s C 1 1 3 3
+ trn1 X0.4s, X0.4s, X0.4s C 0 0 2 2
+ trn2 Y1.4s, X1.4s, X1.4s C 5 5 7 7
+ trn1 X1.4s, X1.4s, X1.4s C 4 4 6 6
+ trn2 Y2.4s, X2.4s, X2.4s C 9 9 11 11
+ trn1 X2.4s, X2.4s, X2.4s C 8 8 10 10
+ trn2 Y3.4s, X3.4s, S3p1.4s C 13 13 15 15
+ trn1 X3.4s, X3.4s, S3p1.4s C 12 12 14 14
+
+.Loop:
+C Register layout (A is first block, B is second block)
+C
+C X0: A0 B0 A2 B2 Y0: A1 B1 A3 B3
+C X1: A4 B4 A6 B6 Y1: A5 B5 A7 B7
+C X2: A8 B8 A10 B10 Y2: A9 B9 A11 B11
+C X3: A12 B12 A14 B14 Y3: A13 B13 A15 B15
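+C Each X/Y register holds two state words from each of the two blocks,
+C so every vector instruction below advances two of the four quarter
+C rounds in both blocks at once.
+C Rotations: rev32 on .8h lanes rotates each 32-bit word by 16 bits,
+C the ushr/sli pairs rotate by 12 and 7, and tbl with ROT24 rotates by 8.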
+ add X0.4s, X0.4s, X1.4s
+ add Y0.4s, Y0.4s, Y1.4s
+ eor X3.16b, X3.16b, X0.16b
+ eor Y3.16b, Y3.16b, Y0.16b
+ rev32 X3.8h, X3.8h
+ rev32 Y3.8h, Y3.8h
+
+ add X2.4s, X2.4s, X3.4s
+ add Y2.4s, Y2.4s, Y3.4s
+ eor TMP0.16b, X1.16b, X2.16b
+ eor TMP1.16b, Y1.16b, Y2.16b
+ ushr X1.4s, TMP0.4s, #20
+ ushr Y1.4s, TMP1.4s, #20
+ sli X1.4s, TMP0.4s, #12
+ sli Y1.4s, TMP1.4s, #12
+
+ add X0.4s, X0.4s, X1.4s
+ add Y0.4s, Y0.4s, Y1.4s
+ eor X3.16b, X3.16b, X0.16b
+ eor Y3.16b, Y3.16b, Y0.16b
+ tbl X3.16b, {X3.16b}, ROT24.16b
+ tbl Y3.16b, {Y3.16b}, ROT24.16b
+
+ add X2.4s, X2.4s, X3.4s
+ add Y2.4s, Y2.4s, Y3.4s
+ eor TMP0.16b, X1.16b, X2.16b
+ eor TMP1.16b, Y1.16b, Y2.16b
+ ushr X1.4s, TMP0.4s, #25
+ ushr Y1.4s, TMP1.4s, #25
+ sli X1.4s, TMP0.4s, #7
+ sli Y1.4s, TMP1.4s, #7
+
+ ext X1.16b, X1.16b, X1.16b, #8
+ ext X2.16b, X2.16b, X2.16b, #8
+ ext Y2.16b, Y2.16b, Y2.16b, #8
+ ext Y3.16b, Y3.16b, Y3.16b, #8
+
+C Register layout:
+C X0: A0 B0 A2 B2 Y0: A1 B1 A3 B3
+C Y1: A5 B5 A7 B7 X1: A6 B6 A4 B4 (X1 swapped)
+C X2: A10 B10 A8 B8 Y2: A11 B11 A9 B9 (X2, Y2 swapped)
+C Y3: A15 B15 A13 B13 X3: A12 B12 A14 B14 (Y3 swapped)
+
+ add X0.4s, X0.4s, Y1.4s
+ add Y0.4s, Y0.4s, X1.4s
+ eor Y3.16b, Y3.16b, X0.16b
+ eor X3.16b, X3.16b, Y0.16b
+ rev32 Y3.8h, Y3.8h
+ rev32 X3.8h, X3.8h
+
+ add X2.4s, X2.4s, Y3.4s
+ add Y2.4s, Y2.4s, X3.4s
+ eor TMP0.16b, Y1.16b, X2.16b
+ eor TMP1.16b, X1.16b, Y2.16b
+ ushr Y1.4s, TMP0.4s, #20
+ ushr X1.4s, TMP1.4s, #20
+ sli Y1.4s, TMP0.4s, #12
+ sli X1.4s, TMP1.4s, #12
+
+ add X0.4s, X0.4s, Y1.4s
+ add Y0.4s, Y0.4s, X1.4s
+ eor Y3.16b, Y3.16b, X0.16b
+ eor X3.16b, X3.16b, Y0.16b
+ tbl Y3.16b, {Y3.16b}, ROT24.16b
+ tbl X3.16b, {X3.16b}, ROT24.16b
+
+ add X2.4s, X2.4s, Y3.4s
+ add Y2.4s, Y2.4s, X3.4s
+ eor TMP0.16b, Y1.16b, X2.16b
+ eor TMP1.16b, X1.16b, Y2.16b
+ ushr Y1.4s, TMP0.4s, #25
+ ushr X1.4s, TMP1.4s, #25
+ sli Y1.4s, TMP0.4s, #7
+ sli X1.4s, TMP1.4s, #7
+
+ ext X1.16b, X1.16b, X1.16b, #8
+ ext X2.16b, X2.16b, X2.16b, #8
+ ext Y2.16b, Y2.16b, Y2.16b, #8
+ ext Y3.16b, Y3.16b, Y3.16b, #8
+
+ subs ROUNDS, ROUNDS, #2
+ b.ne .Loop
+
+ trn1 T0.4s, X0.4s, Y0.4s
+ trn2 Y0.4s, X0.4s, Y0.4s
+
+ trn1 X0.4s, X1.4s, Y1.4s
+ trn2 Y1.4s, X1.4s, Y1.4s
+
+ trn1 X1.4s, X2.4s, Y2.4s
+ trn2 Y2.4s, X2.4s, Y2.4s
+
+ trn1 X2.4s, X3.4s, Y3.4s
+ trn2 Y3.4s, X3.4s, Y3.4s
+
+ add T0.4s, T0.4s, S0.4s
+ add Y0.4s, Y0.4s, S0.4s
+ add X0.4s, X0.4s, S1.4s
+ add Y1.4s, Y1.4s, S1.4s
+ add X1.4s, X1.4s, S2.4s
+ add Y2.4s, Y2.4s, S2.4s
+ add X2.4s, X2.4s, S3.4s
+ add Y3.4s, Y3.4s, S3p1.4s
+
+ st1 {T0.16b,X0.16b,X1.16b,X2.16b}, [DST], #64
+ st1 {Y0.16b,Y1.16b,Y2.16b,Y3.16b}, [DST]
+ ret
+EPILOGUE(_nettle_chacha_2core)
+
+PROLOGUE(_nettle_chacha_2core32)
+ eor Y3.16b, Y3.16b, Y3.16b C {0,0,...,0}
+ mov w3, #1
+ mov Y3.s[0], w3 C {1,0,...,0}
+ add x3, SRC, #48
+ ld1 {X3.4s}, [x3]
+ b .Lshared_entry
+EPILOGUE(_nettle_chacha_2core32)
+
+.align 4
+.Lrot24: .long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f C tbl indices: rotate each 32-bit word left by 8 bits
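In terms of the chacha_core_model sketch above (names again illustrative, not Nettle's), the two-block entry point just shown computes two consecutive output blocks, with the second block's 64-bit counter in words 12-13 incremented by one:

/* Semantics sketch for _nettle_chacha_2core. */
static void
chacha_2core_model(uint32_t *dst, const uint32_t *src, unsigned rounds)
{
  uint32_t src2[16];

  memcpy(src2, src, sizeof(src2));
  src2[12]++;			/* low counter word */
  if (src2[12] == 0)
    src2[13]++;			/* carry, handled by the cmhi/ext sequence */

  chacha_core_model(dst, src, rounds);
  chacha_core_model(dst + 16, src2, rounds);
}

_nettle_chacha_2core32 is the same except that the carry into word 13 is dropped, matching a 32-bit block counter.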
diff --git a/arm64/chacha-4core.asm b/arm64/chacha-4core.asm
new file mode 100644
index 00000000..b4306ca9
--- /dev/null
+++ b/arm64/chacha-4core.asm
@@ -0,0 +1,228 @@
+C arm64/chacha-4core.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+ Copyright (C) 2022 Mamone Tarsha
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `x0')
+define(`SRC', `x1')
+define(`ROUNDS', `x2')
+
+C Working state
+
+C During the loop, used to save the original values for last 4 words
+C of each block. Also used as temporaries for transpose.
+define(`T0', `v0')
+define(`T1', `v1')
+define(`T2', `v2')
+define(`T3', `v3')
+
+define(`TMP0', `v4')
+define(`TMP1', `v5')
+define(`TMP2', `v6')
+define(`TMP3', `v7')
+
+define(`ROT24', `v8')
+
+C Quarter round, applied to four independent (a, b, c, d) groups of
+C registers: ($1-$4), ($5-$8), ($9-$12), ($13-$16).
+define(`QR',`
+ add $1.4s, $1.4s, $2.4s
+ add $5.4s, $5.4s, $6.4s
+ add $9.4s, $9.4s, $10.4s
+ add $13.4s, $13.4s, $14.4s
+ eor $4.16b, $4.16b, $1.16b
+ eor $8.16b, $8.16b, $5.16b
+ eor $12.16b, $12.16b, $9.16b
+ eor $16.16b, $16.16b, $13.16b
+ rev32 $4.8h, $4.8h
+ rev32 $8.8h, $8.8h
+ rev32 $12.8h, $12.8h
+ rev32 $16.8h, $16.8h
+
+ add $3.4s, $3.4s, $4.4s
+ add $7.4s, $7.4s, $8.4s
+ add $11.4s, $11.4s, $12.4s
+ add $15.4s, $15.4s, $16.4s
+ eor TMP0.16b, $2.16b, $3.16b
+ eor TMP1.16b, $6.16b, $7.16b
+ eor TMP2.16b, $10.16b, $11.16b
+ eor TMP3.16b, $14.16b, $15.16b
+ ushr $2.4s, TMP0.4s, #20
+ ushr $6.4s, TMP1.4s, #20
+ ushr $10.4s, TMP2.4s, #20
+ ushr $14.4s, TMP3.4s, #20
+ sli $2.4s, TMP0.4s, #12
+ sli $6.4s, TMP1.4s, #12
+ sli $10.4s, TMP2.4s, #12
+ sli $14.4s, TMP3.4s, #12
+
+ add $1.4s, $1.4s, $2.4s
+ add $5.4s, $5.4s, $6.4s
+ add $9.4s, $9.4s, $10.4s
+ add $13.4s, $13.4s, $14.4s
+ eor $4.16b, $4.16b, $1.16b
+ eor $8.16b, $8.16b, $5.16b
+ eor $12.16b, $12.16b, $9.16b
+ eor $16.16b, $16.16b, $13.16b
+ tbl $4.16b, {$4.16b}, ROT24.16b
+ tbl $8.16b, {$8.16b}, ROT24.16b
+ tbl $12.16b, {$12.16b}, ROT24.16b
+ tbl $16.16b, {$16.16b}, ROT24.16b
+
+ add $3.4s, $3.4s, $4.4s
+ add $7.4s, $7.4s, $8.4s
+ add $11.4s, $11.4s, $12.4s
+ add $15.4s, $15.4s, $16.4s
+ eor TMP0.16b, $2.16b, $3.16b
+ eor TMP1.16b, $6.16b, $7.16b
+ eor TMP2.16b, $10.16b, $11.16b
+ eor TMP3.16b, $14.16b, $15.16b
+ ushr $2.4s, TMP0.4s, #25
+ ushr $6.4s, TMP1.4s, #25
+ ushr $10.4s, TMP2.4s, #25
+ ushr $14.4s, TMP3.4s, #25
+ sli $2.4s, TMP0.4s, #7
+ sli $6.4s, TMP1.4s, #7
+ sli $10.4s, TMP2.4s, #7
+ sli $14.4s, TMP3.4s, #7
+')
+
+define(`TRANSPOSE',`
+ zip1 T0.4s, $1.4s, $3.4s C A0 A2 B0 B2
+ zip1 T1.4s, $2.4s, $4.4s C A1 A3 B1 B3
+ zip2 T2.4s, $1.4s, $3.4s C C0 C2 D0 D2
+ zip2 T3.4s, $2.4s, $4.4s C C1 C3 D1 D3
+
+ zip1 $1.4s, T0.4s, T1.4s C A0 A1 A2 A3
+ zip2 $2.4s, T0.4s, T1.4s C B0 B1 B2 B3
+ zip1 $3.4s, T2.4s, T3.4s C C0 C1 C2 C3
+ zip2 $4.4s, T2.4s, T3.4s C D0 D1 D2 D3
+')
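+C TRANSPOSE is used after the rounds: during the loop each vector holds
+C one state word for all four blocks, and the transpose regroups the
+C words into four consecutive 64-byte output blocks.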
+
+ C _chacha_4core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+PROLOGUE(_nettle_chacha_4core)
+
+ mov w3, #1
+ dup TMP2.4s, w3 C {1,1,1,1}: keep counter carries (zeroed in _4core32)
+
+.Lshared_entry:
+
+ C Save callee-save register: only the low 64 bits of v8 (ROT24)
+ C need preserving, so stash d8 in x3.
+ fmov x3, d8
+
+ adr x4, .Lcnts
+ ld1 {TMP3.4s,ROT24.4s},[x4]
+
+C Load state and splat
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [SRC]
+
+ dup v20.4s, v16.s[1]
+ dup v24.4s, v16.s[2]
+ dup v28.4s, v16.s[3]
+ dup v16.4s, v16.s[0]
+ dup v21.4s, v17.s[1]
+ dup v25.4s, v17.s[2]
+ dup v29.4s, v17.s[3]
+ dup v17.4s, v17.s[0]
+ dup v22.4s, v18.s[1]
+ dup v26.4s, v18.s[2]
+ dup v30.4s, v18.s[3]
+ dup v18.4s, v18.s[0]
+ dup v23.4s, v19.s[1]
+ dup v27.4s, v19.s[2]
+ dup v31.4s, v19.s[3]
+ dup v19.4s, v19.s[0]
+
+ add v19.4s, v19.4s, TMP3.4s C low adds
+ cmhi TMP1.4s, TMP3.4s, v19.4s C compute carry-out
+ and TMP1.16b, TMP1.16b, TMP2.16b C convert masks to 0/1; drops carries in the 32-bit counter variant
+ add v23.4s, v23.4s, TMP1.4s C apply carries
+
+ C Save the last row (words 12-15, counter and nonce) of all four blocks.
+ mov T0.16b, v19.16b
+ mov T1.16b, v23.16b
+ mov T2.16b, v27.16b
+ mov T3.16b, v31.16b
+
+.Loop:
+ C Column round
+ QR(v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31)
+ C Diagonal round
+ QR(v16, v21, v26, v31, v20, v25, v30, v19, v24, v29, v18, v23, v28, v17, v22, v27)
+ subs ROUNDS, ROUNDS, #2
+ b.ne .Loop
+
+ C Add in saved original words, including counters, before
+ C transpose.
+ add v19.4s, v19.4s, T0.4s
+ add v23.4s, v23.4s, T1.4s
+ add v27.4s, v27.4s, T2.4s
+ add v31.4s, v31.4s, T3.4s
+
+ TRANSPOSE(v16, v20, v24, v28)
+ TRANSPOSE(v17, v21, v25, v29)
+ TRANSPOSE(v18, v22, v26, v30)
+ TRANSPOSE(v19, v23, v27, v31)
+
+ ld1 {T0.4s,T1.4s,T2.4s}, [SRC]
+
+ add v16.4s, v16.4s, T0.4s
+ add v20.4s, v20.4s, T0.4s
+ add v24.4s, v24.4s, T0.4s
+ add v28.4s, v28.4s, T0.4s
+
+ add v17.4s, v17.4s, T1.4s
+ add v21.4s, v21.4s, T1.4s
+ add v25.4s, v25.4s, T1.4s
+ add v29.4s, v29.4s, T1.4s
+
+ add v18.4s, v18.4s, T2.4s
+ add v22.4s, v22.4s, T2.4s
+ add v26.4s, v26.4s, T2.4s
+ add v30.4s, v30.4s, T2.4s
+
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [DST], #64
+ st1 {v20.16b,v21.16b,v22.16b,v23.16b}, [DST], #64
+ st1 {v24.16b,v25.16b,v26.16b,v27.16b}, [DST], #64
+ st1 {v28.16b,v29.16b,v30.16b,v31.16b}, [DST]
+
+ C Restore callee-save registers
+ fmov d8, x3
+ ret
+EPILOGUE(_nettle_chacha_4core)
+
+PROLOGUE(_nettle_chacha_4core32)
+ eor TMP2.16b, TMP2.16b, TMP2.16b C Ignore counter carries
+ b .Lshared_entry
+EPILOGUE(_nettle_chacha_4core32)
+
+.align 4
+.Lcnts: .long 0,1,2,3 C increments
+.Lrot24: .long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
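The four-block variant splats each of the 16 state words across its own vector register, gives block i the counter low word src[12] + i, and (for _nettle_chacha_4core) folds any carry into word 13; _nettle_chacha_4core32 drops the carries. A rough per-block model, again with illustrative names and building on chacha_core_model above:

static void
chacha_4core_model(uint32_t *dst, const uint32_t *src, unsigned rounds,
		   int carry_into_word13)
{
  unsigned i;

  for (i = 0; i < 4; i++)
    {
      uint32_t blk[16];

      memcpy(blk, src, sizeof(blk));
      blk[12] = src[12] + i;			/* per-block counter */
      if (carry_into_word13 && blk[12] < src[12])
	blk[13]++;				/* 64-bit counter carry */
      chacha_core_model(dst + 16*i, blk, rounds);
    }
}

With v16-v31 holding the state and v0-v7 used as temporaries, the rotate mask ends up in the callee-saved v8, hence the d8 save and restore around the function.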
diff --git a/arm64/chacha-core-internal.asm b/arm64/chacha-core-internal.asm
new file mode 100644
index 00000000..9b70e0dc
--- /dev/null
+++ b/arm64/chacha-core-internal.asm
@@ -0,0 +1,126 @@
+C arm64/chacha-core-internal.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+ Copyright (C) 2022 Mamone Tarsha
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `x0')
+define(`SRC', `x1')
+define(`ROUNDS', `x2')
+
+C Working state
+define(`X0', `v0')
+define(`X1', `v1')
+define(`X2', `v2')
+define(`X3', `v3')
+
+C Original input state
+define(`S0', `v4')
+define(`S1', `v5')
+define(`S2', `v6')
+define(`S3', `v7')
+
+define(`ROT24', `v16')
+
+define(`TMP', `v17')
+
+C QROUND(X0, X1, X2, X3)
+define(`QROUND', `
+ C x0 += x1, x3 ^= x0, x3 lrot 16
+ C x2 += x3, x1 ^= x2, x1 lrot 12
+ C x0 += x1, x3 ^= x0, x3 lrot 8
+ C x2 += x3, x1 ^= x2, x1 lrot 7
+
+ add $1.4s, $1.4s, $2.4s
+ eor $4.16b, $4.16b, $1.16b
+ rev32 $4.8h, $4.8h
+
+ add $3.4s, $3.4s, $4.4s
+ eor TMP.16b, $2.16b, $3.16b
+ ushr $2.4s, TMP.4s, #20
+ sli $2.4s, TMP.4s, #12
+
+ add $1.4s, $1.4s, $2.4s
+ eor $4.16b, $4.16b, $1.16b
+ tbl $4.16b, {$4.16b}, ROT24.16b
+
+ add $3.4s, $3.4s, $4.4s
+ eor TMP.16b, $2.16b, $3.16b
+ ushr $2.4s, TMP.4s, #25
+ sli $2.4s, TMP.4s, #7
+')
+
+ .text
+ C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+PROLOGUE(_nettle_chacha_core)
+ adr x3, .Lrot24
+ ld1 {ROT24.4s},[x3]
+
+ ld1 {X0.4s,X1.4s,X2.4s,X3.4s}, [SRC]
+
+ mov S0.16b, X0.16b
+ mov S1.16b, X1.16b
+ mov S2.16b, X2.16b
+ mov S3.16b, X3.16b
+
+.Loop:
+ C Column round
+ QROUND(X0, X1, X2, X3)
+ C Rotate rows, to get
+ C 0 1 2 3
+ C 5 6 7 4 <<< 1
+ C 10 11 8 9 <<< 2
+ C 15 12 13 14 <<< 3
+
+ ext X1.16b, X1.16b, X1.16b, #4
+ ext X2.16b, X2.16b, X2.16b, #8
+ ext X3.16b, X3.16b, X3.16b, #12
+
+ C Diagonal round
+ QROUND(X0, X1, X2, X3)
+
+ ext X1.16b, X1.16b, X1.16b, #12
+ ext X2.16b, X2.16b, X2.16b, #8
+ ext X3.16b, X3.16b, X3.16b, #4
+
+ subs ROUNDS, ROUNDS, #2
+ b.ne .Loop
+
+ add X0.4s, X0.4s, S0.4s
+ add X1.4s, X1.4s, S1.4s
+ add X2.4s, X2.4s, S2.4s
+ add X3.4s, X3.4s, S3.4s
+
+ st1 {X0.16b,X1.16b,X2.16b,X3.16b}, [DST]
+ ret
+EPILOGUE(_nettle_chacha_core)
+
+.align 4
+.Lrot24: .long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f