1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
|
C nettle, low-level cryptographics library
C
C Copyright (C) 2012 Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.
C Register allocation.
C
C The 25 state words are numbered 00 to 24.  Words 00, 05, 10, 15 and
C 20 each get a general register.  The remaining words are kept
C pairwise in xmm registers, e.g. A0102 holds words 01 and 02; the
C pairs are loaded with movups from consecutive state words, so the
C lower-numbered word ends up in the low 64 bits.
define(<CTX>, <%rdi>) C 25 64-bit values, 200 bytes.
define(<COUNT>, <%r8>) C Avoid clobbering %rsi, for W64.
define(<A00>, <%rax>)
define(<A0102>, <%xmm0>)
define(<A0304>, <%xmm1>)
define(<A05>, <%rcx>)
define(<A0607>, <%xmm2>)
define(<A0809>, <%xmm3>)
define(<A10>, <%rdx>)
define(<A1112>, <%xmm4>)
define(<A1314>, <%xmm5>)
define(<A15>, <%rbp>)
define(<A1617>, <%xmm6>)
define(<A1819>, <%xmm7>)
define(<A20>, <%r9>)
define(<A2122>, <%xmm8>)
define(<A2324>, <%xmm9>)
C Theta inputs C0..C4 (xor of the five words with the same index
C modulo 5), in the same one-plus-two-pairs layout as a state row.
define(<C0>, <%r10>)
define(<C12>, <%xmm10>)
define(<C34>, <%xmm11>)
C Theta outputs D0..D4, the values xored into the state.
define(<D0>, <%r11>)
define(<D12>, <%xmm12>)
define(<D34>, <%xmm13>)
C Wide temporaries
C W2 and W3 reuse the D registers.  They are only used after the theta
C step has finished xoring D12 and D34 into the state.
define(<W0>, <%xmm14>)
define(<W1>, <%xmm15>)
define(<W2>, <%xmm12>) C Overlap D12
define(<W3>, <%xmm13>) C Overlap D34
C Scalar temporaries.  T2 and T3 are used only by the chi step, which
C runs after theta is done with D0 and before C0 is reloaded from A00.
define(<T0>, <%r12>)
define(<T1>, <%r13>)
define(<T2>, <%r11>) C Overlap D0
define(<T3>, <%r10>) C Overlap C0
C Base address for the round-constant lookup (RC, COUNT, 8).
define(<RC>, <%r14>)
C OFFSET(i) is the byte offset of 64-bit word i, that is 8*i.  For
C i = 0 it expands to nothing, so that STATE(0) becomes plain (CTX).
define(<OFFSET>, <ifelse($1,0,,eval(8*$1))>)
C STATE(i) is the memory operand for state word i, relative to CTX.
define(<STATE>, <OFFSET($1)(CTX)>)
C SWAP64 src, dst
C Expands to pshufd with immediate 0x4e and a trailing comma; the
C caller supplies the two operands.  The result is src with its two
C 64-bit halves exchanged.  The dollar sign is quoted so that m4 does
C not combine it with the following digit into an argument reference.
define(<SWAP64>, <pshufd <$>0x4e,>)
C DIRECT_MOVQ selects the implementation of MOVQ below:
C   yes  a single movd instruction between the two registers
C   no   a store to, and a reload from, the first 8 bytes at CTX
C With the memory variant every MOVQ overwrites state word 0 in
C memory.  Code using it must therefore keep word 0 in a register and
C write it back to memory after the last MOVQ.
define(<DIRECT_MOVQ>, <no>)
C MOVQ(src, dst), for moves between a general register and an xmm
C register.
ifelse(DIRECT_MOVQ, yes, <
C movq calls that are equal to the corresponding movd,
C where the Apple assembler requires them to be written as movd.
define(<MOVQ>, <movd $1, $2>)
>, <
C Moving via (cached) memory is generally faster.
define(<MOVQ>, <
movq $1, (CTX)
movq (CTX), $2
>)>)
C ROTL64(rot, register, temp)
C Caller needs to or together the result.
C
C On exit, register holds its old value shifted left by rot and temp
C holds the old value shifted right by 64 - rot, separately in each
C 64-bit half.  A por of the two, into either one, gives both halves
C rotated left by rot.  rot must be a literal number, since 64 - rot
C is computed by m4 eval when the macro is expanded.
define(<ROTL64>, <
movdqa $2, $3
psllq <$>$1, $2
psrlq <$>eval(64-$1), $3
>)
C ----------------------------------------------------------------------
C sha3_permute(struct sha3_state *ctx)
C
C Applies the 24-round SHA-3 (Keccak) permutation in place to the 25
C 64-bit state words at CTX.
C
C Register use (see the defines at the top of the file):
C   A00, A05, A10, A15, A20  one state word each, general registers
C   A0102 ... A2324          two state words each, xmm registers,
C                            lower-numbered word in the low half
C   C0, C12, C34             theta inputs, C0 = A00^A05^A10^A15^A20 etc.
C   D0, D12, D34             theta outputs
C   W0-W3, T0-T3             temporaries
C   COUNT                    rounds left, counts from 24 down to 1
C   RC                       .rc - 8, so (RC, COUNT, 8) is entry COUNT-1
C
C The state is loaded once, kept in registers for all rounds and
C stored once at the end.  Within a round the rho and pi steps leave
C the 5x5 matrix transposed, chi works on the transposed matrix, and
C the end of the round transposes it back while recomputing C0, C12
C and C34 for the next theta step.
C
C %rbp and %r12-%r14 are saved and restored here.  W64_ENTRY and
C W64_EXIT are defined outside this file; presumably they deal with
C the Windows calling convention (argument register, xmm6-xmm15).
C TODO confirm against the machine m4 files.
C
C Unless DIRECT_MOVQ is yes, MOVQ uses the first 8 bytes at CTX as
C scratch space.  That is safe only because word 0 stays in A00 during
C the loop and is stored back after the last round.
C ----------------------------------------------------------------------
        .file "sha3-permute.asm"
        .text
        ALIGN(16)
PROLOGUE(nettle_sha3_permute)
        W64_ENTRY(1, 16)
        push    %rbp
        push    %r12
        push    %r13
        push    %r14

        movl    $24, XREG(COUNT)
        lea     .rc-8(%rip), RC

        C Load the state.  The xors that build C0, C12 and C34 are
        C interleaved with the loads.
        movq    STATE(0), A00
        movups  STATE(1), A0102
        movups  STATE(3), A0304
        movq    A00, C0

        movq    STATE(5), A05
        movdqa  A0102, C12
        movups  STATE(6), A0607
        movdqa  A0304, C34
        movups  STATE(8), A0809
        xorq    A05, C0

        movq    STATE(10), A10
        pxor    A0607, C12
        movups  STATE(11), A1112
        pxor    A0809, C34
        movups  STATE(13), A1314
        xorq    A10, C0

        movq    STATE(15), A15
        pxor    A1112, C12
        movups  STATE(16), A1617
        pxor    A1314, C34
        movups  STATE(18), A1819
        xorq    A15, C0

        movq    STATE(20), A20
        pxor    A1617, C12
        movups  STATE(21), A2122
        pxor    A1819, C34
        movups  STATE(23), A2324
        xorq    A20, C0
        pxor    A2122, C12
        pxor    A2324, C34

        ALIGN(16)
.Loop:
        C The theta step. Combine parity bits, then xor to state.
        C D0 = C4 ^ (C1 <<< 1)
        C D1 = C0 ^ (C2 <<< 1)
        C D2 = C1 ^ (C3 <<< 1)
        C D3 = C2 ^ (C4 <<< 1)
        C D4 = C3 ^ (C0 <<< 1)

        C Shift the words around, putting (C0, C1) in D12, (C2, C3) in
        C D34, and (C4, C0) in C34.

        C Notes on "unpack" instructions:
        C punpckhqdq 01, 23 gives 31
        C punpcklqdq 01, 23 gives 20

        SWAP64  C34, C34                C Holds C4, C3
        movdqa  C12, D34
        MOVQ(C0, D12)
        punpcklqdq      C12, D12        C Holds C0, C1
        punpckhqdq      C34, D34        C Holds C2, C3
        punpcklqdq      D12, C34        C Holds C4, C0
        MOVQ(C34, D0)                   C D0 = C4
        MOVQ(C12, T0)                   C T0 = C1
        rolq    $1, T0
        xorq    T0, D0                  C Done D0

        C Can use C12 as temporary
        C D12 = (C0, C1) ^ ((C2, C3) <<< 1)
        movdqa  D34, W0
        movdqa  D34, W1
        psllq   $1, W0
        psrlq   $63, W1
        pxor    W0, D12
        pxor    W1, D12                 C Done D12

        C D34 = (C2, C3) ^ ((C4, C0) <<< 1)
        movdqa  C34, C12
        psrlq   $63, C34
        psllq   $1, C12
        pxor    C34, D34
        pxor    C12, D34                C Done D34

        xorq    D0, A00
        xorq    D0, A05
        xorq    D0, A10
        xorq    D0, A15
        xorq    D0, A20
        pxor    D12, A0102
        pxor    D12, A0607
        pxor    D12, A1112
        pxor    D12, A1617
        pxor    D12, A2122
        pxor    D34, A0304
        pxor    D34, A0809
        pxor    D34, A1314
        pxor    D34, A1819
        pxor    D34, A2324

        C theta step done, no C, D or W temporaries alive.

        C rho and pi steps. When doing the permutations, also
        C transpose the matrix.

        C The combined permutation + transpose gives the following
        C cycles (rotation counts in parenthesis)
        C 0 <- 0(0)
        C 1 <- 3(28) <- 4(27) <- 2(62) <- 1(1)
        C 5 <- 6(44) <- 9(20) <- 8(55) <- 5(36)
        C 7 <- 7(6)
        C 10 <- 12(43) <- 13(25) <- 11(10) <- 10(3)
        C 14 <- 14(39)
        C 15 <- 18(21) <- 17(15) <- 19(8) <- 15(41)
        C 16 <- 16(45)
        C 20 <- 24(14) <- 21(2) <- 22(61) <- 20(18)
        C 23 <- 23(56)
        C
        C Read "a <- b(r)" as: new word a is old word b rotated left
        C by r bits.

        C Do the 1,2,3,4 row. First rotate, then permute.
        C Each word of a pair needs its own rotation count, so the
        C shifts are done on whole registers and the unwanted halves
        C are dropped by the final unpack instructions.
        movdqa  A0102, W0
        movdqa  A0102, W1
        movdqa  A0102, W2
        psllq   $1, A0102
        psrlq   $63, W0
        psllq   $62, W1
        por     A0102, W0               C rotl 1 (A01)
        psrlq   $2, W2
        por     W1, W2                  C rotl 62 (A02)

        movdqa  A0304, A0102
        movdqa  A0304, W1
        psllq   $28, A0102
        psrlq   $36, W1
        por     W1, A0102               C rotl 28 (A03)
        movdqa  A0304, W1
        psllq   $27, A0304
        psrlq   $37, W1
        por     W1, A0304               C rotl 27 (A04)

        punpcklqdq      W0, A0102       C (A03 <<< 28, A01 <<< 1)
        punpckhqdq      W2, A0304       C (A04 <<< 27, A02 <<< 62)

        C 5 <- 6(44) <- 9(20) <- 8(55) <- 5(36)
        C 7 <- 7(6)
        C New A05 comes from 6, new A0607 from (9, 7) and new A0809
        C from (5, 8).
        rolq    $36, A05
        MOVQ(A05, W0)                   C Keep old 5 for new 8
        MOVQ(A0607, A05)
        rolq    $44, A05                C Done A05
        ROTL64(6, A0607, W1)
        por     A0607, W1
        movdqa  A0809, A0607
        ROTL64(20, A0607, W2)
        por     W2, A0607
        punpckhqdq      W1, A0607       C Done A0607
        ROTL64(55, A0809, W1)
        por     A0809, W1
        movdqa  W0, A0809
        punpcklqdq      W1, A0809       C Done A0809

        C 10 <- 12(43) <- 13(25) <- 11(10) <- 10(3)
        C 14 <- 14(39)
        C New A10 comes from 12, new A1112 from (10, 13) and new A1314
        C from (11, 14).  Old 10 shares a register with old 13 and gets
        C the common rotation by 25, hence the extra rotation by 42
        C first.
        rolq    $42, A10                C 42 + 25 = 3 (mod 64)
        SWAP64  A1112, W0
        MOVQ(A10, A1112)
        MOVQ(W0, A10)
        rolq    $43, A10                C Done A10

        punpcklqdq      A1314, A1112
        ROTL64(25, A1112, W1)
        por     W1, A1112               C Done A1112
        ROTL64(39, A1314, W2)
        por     A1314, W2
        ROTL64(10, W0, A1314)
        por     W0, A1314
        punpckhqdq      W2, A1314       C Done A1314

        C 15 <- 18(21) <- 17(15) <- 19(8) <- 15(41)
        C 16 <- 16(45)
        C New A15 comes from 18, new A1617 from (16, 19) and new A1819
        C from (17, 15).
        SWAP64  A1819, W0
        rolq    $41, A15
        MOVQ(A15, W1)                   C Keep old 15 for new 19
        MOVQ(A1819, A15)
        rolq    $21, A15                C Done A15
        SWAP64  A1617, A1819
        ROTL64(45, A1617, W2)
        por     W2, A1617
        ROTL64(8, W0, W3)
        por     W3, W0
        punpcklqdq      W0, A1617       C Done A1617
        ROTL64(15, A1819, W2)
        por     W2, A1819
        punpcklqdq      W1, A1819       C Done A1819

        C 20 <- 24(14) <- 21(2) <- 22(61) <- 20(18)
        C 23 <- 23(56)
        C New A20 comes from 24, new A2122 from (22, 20) and new A2324
        C from (23, 21).
        rolq    $18, A20
        MOVQ(A20, W0)                   C Keep old 20 for new 22
        SWAP64  A2324, W1
        C Go through MOVQ like every other general/xmm transfer, so
        C that DIRECT_MOVQ selects the method for this move as well.
        MOVQ(W1, A20)
        rolq    $14, A20                C Done A20
        ROTL64(56, A2324, W1)
        por     W1, A2324

        movdqa  A2122, W2
        ROTL64(2, W2, W1)
        por     W1, W2
        punpcklqdq      W2, A2324       C Done A2324

        ROTL64(61, A2122, W1)
        por     W1, A2122
        psrldq  $8, A2122
        punpcklqdq      W0, A2122       C Done A2122

        C chi step. With the transposed matrix, applied independently
        C to each column.
        C Each word gets xored with (not next) and (next but one),
        C going cyclically through the five registers of a column.
        C The terms for the first two words are parked in T0 and T1,
        C since the words they are computed from are modified before
        C those two words are updated.
        movq    A05, T0
        notq    T0
        andq    A10, T0
        movq    A10, T1
        notq    T1
        andq    A15, T1
        movq    A15, T2
        notq    T2
        andq    A20, T2
        xorq    T2, A10
        movq    A20, T3
        notq    T3
        andq    A00, T3
        xorq    T3, A15
        movq    A00, T2
        notq    T2
        andq    A05, T2
        xorq    T2, A20
        xorq    T0, A00
        xorq    T1, A05

        C Same pattern for two columns at a time; pandn computes
        C (not destination) and source.
        movdqa  A0607, W0
        pandn   A1112, W0
        movdqa  A1112, W1
        pandn   A1617, W1
        movdqa  A1617, W2
        pandn   A2122, W2
        pxor    W2, A1112
        movdqa  A2122, W3
        pandn   A0102, W3
        pxor    W3, A1617
        movdqa  A0102, W2
        pandn   A0607, W2
        pxor    W2, A2122
        pxor    W0, A0102
        pxor    W1, A0607

        movdqa  A0809, W0
        pandn   A1314, W0
        movdqa  A1314, W1
        pandn   A1819, W1
        movdqa  A1819, W2
        pandn   A2324, W2
        pxor    W2, A1314
        movdqa  A2324, W3
        pandn   A0304, W3
        pxor    W3, A1819
        movdqa  A0304, W2
        pandn   A0809, W2
        pxor    W2, A2324
        pxor    W0, A0304
        pxor    W1, A0809

        C Add the round constant to word 0.
        xorq    (RC, COUNT, 8), A00

        C Transpose.
        C Swap (A05, A10) <-> A0102, and (A15, A20) <-> A0304,
        C and also copy to C12 and C34 while at it.
        C The xors for the next round's C0, C12 and C34 are spread out
        C between the moves below.

        MOVQ(A05, C12)
        MOVQ(A15, C34)
        MOVQ(A10, W0)
        MOVQ(A20, W1)
        movq    A00, C0
        punpcklqdq      W0, C12
        punpcklqdq      W1, C34
        MOVQ(A0102, A05)
        MOVQ(A0304, A15)
        psrldq  $8, A0102
        psrldq  $8, A0304
        xorq    A05, C0
        xorq    A15, C0
        MOVQ(A0102, A10)
        MOVQ(A0304, A20)

        movdqa  C12, A0102
        movdqa  C34, A0304

        C Transpose (A0607, A1112)
        movdqa  A0607, W0
        punpcklqdq      A1112, A0607
        xorq    A10, C0
        xorq    A20, C0
        punpckhqdq      W0, A1112
        SWAP64  A1112, A1112

        C Transpose (A1819, A2324)
        movdqa  A1819, W0
        punpcklqdq      A2324, A1819
        pxor    A0607, C12
        pxor    A1112, C12
        punpckhqdq      W0, A2324
        SWAP64  A2324, A2324

        C Transpose (A0809, A1314) and (A1617, A2122), and swap
        movdqa  A0809, W0
        movdqa  A1314, W1
        movdqa  A1617, A0809
        movdqa  A2122, A1314
        pxor    A1819, C34
        pxor    A2324, C34
        punpcklqdq      A2122, A0809
        punpckhqdq      A1617, A1314
        SWAP64  A1314, A1314
        movdqa  W0, A1617
        movdqa  W1, A2122
        pxor    A0809, C34
        pxor    A1314, C34
        punpcklqdq      W1, A1617
        punpckhqdq      W0, A2122
        SWAP64  A2122, A2122

        C The two pxor instructions do not modify the flags, so jnz
        C still sees the zero flag set by decl.
        decl    XREG(COUNT)
        pxor    A1617, C12
        pxor    A2122, C12
        jnz     .Loop

        C Store the state.  Writing word 0 also replaces whatever the
        C memory variant of MOVQ left in the first 8 bytes at CTX.
        movq    A00, STATE(0)
        movups  A0102, STATE(1)
        movups  A0304, STATE(3)

        movq    A05, STATE(5)
        movups  A0607, STATE(6)
        movups  A0809, STATE(8)

        movq    A10, STATE(10)
        movups  A1112, STATE(11)
        movups  A1314, STATE(13)

        movq    A15, STATE(15)
        movups  A1617, STATE(16)
        movups  A1819, STATE(18)

        movq    A20, STATE(20)
        movups  A2122, STATE(21)
        movups  A2324, STATE(23)

        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        W64_EXIT(1, 16)
        ret

EPILOGUE(nettle_sha3_permute)
ALIGN(16)
C Round constants, 24 entries of 64 bits each.
C
C The permutation loop reads the entry at .rc + 8*(COUNT - 1), with
C COUNT running from 24 down to 1.  The constant for the first round
C is therefore the last entry of the table, and the constant for the
C final round is the first entry.
.rc: C In reverse order
.quad 0x8000000080008008
.quad 0x0000000080000001
.quad 0x8000000000008080
.quad 0x8000000080008081
.quad 0x800000008000000A
.quad 0x000000000000800A
.quad 0x8000000000000080
.quad 0x8000000000008002
.quad 0x8000000000008003
.quad 0x8000000000008089
.quad 0x800000000000008B
.quad 0x000000008000808B
.quad 0x000000008000000A
.quad 0x0000000080008009
.quad 0x0000000000000088
.quad 0x000000000000008A
.quad 0x8000000000008009
.quad 0x8000000080008081
.quad 0x0000000080000001
.quad 0x000000000000808B
.quad 0x8000000080008000
.quad 0x800000000000808A
.quad 0x0000000000008082
.quad 0x0000000000000001
|