diff options
Diffstat (limited to 'nacl/crypto_stream/aes128ctr/portable/common.h')
-rw-r--r-- | nacl/crypto_stream/aes128ctr/portable/common.h | 788 |
1 files changed, 788 insertions, 0 deletions
diff --git a/nacl/crypto_stream/aes128ctr/portable/common.h b/nacl/crypto_stream/aes128ctr/portable/common.h new file mode 100644 index 00000000..0f723332 --- /dev/null +++ b/nacl/crypto_stream/aes128ctr/portable/common.h | |||
@@ -0,0 +1,788 @@ | |||
1 | /* Author: Peter Schwabe, ported from an assembly implementation by Emilia Käsper | ||
2 | Date: 2009-03-19 | ||
3 | Public domain */ | ||
4 | #ifndef COMMON_H | ||
5 | #define COMMON_H | ||
6 | |||
7 | #include "types.h" | ||
8 | |||
9 | #define load32_bigendian crypto_stream_aes128ctr_portable_load32_bigendian | ||
10 | uint32 load32_bigendian(const unsigned char *x); | ||
11 | |||
12 | #define store32_bigendian crypto_stream_aes128ctr_portable_store32_bigendian | ||
13 | void store32_bigendian(unsigned char *x,uint32 u); | ||
14 | |||
15 | #define load32_littleendian crypto_stream_aes128ctr_portable_load32_littleendian | ||
16 | uint32 load32_littleendian(const unsigned char *x); | ||
17 | |||
18 | #define store32_littleendian crypto_stream_aes128ctr_portable_store32_littleendian | ||
19 | void store32_littleendian(unsigned char *x,uint32 u); | ||
20 | |||
21 | #define load64_littleendian crypto_stream_aes128ctr_portable_load64_littleendian | ||
22 | uint64 load64_littleendian(const unsigned char *x); | ||
23 | |||
24 | #define store64_littleendian crypto_stream_aes128ctr_portable_store64_littleendian | ||
25 | void store64_littleendian(unsigned char *x,uint64 u); | ||
26 | |||
27 | /* Macros required only for key expansion */ | ||
28 | |||
/*
 * keyexpbs1 -- round 1 of the bitsliced AES-128 key expansion.
 * b0..b7: the eight bit planes of round key 0 (inputs; clobbered).
 * t0..t7: scratch planes (clobbered).
 * bskey:  byte pointer to the bitsliced key schedule; round key 0 is read
 *         from bskey+0..112 and round key 1 is written to bskey+128..240.
 * Subsequent code consumes the S-box output in plane order
 * b0,b1,b4,b6,b3,b7,b2,b5 -- keep that permutation when editing.
 */
#define keyexpbs1(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7, bskey) \
  rotbyte(&b0);\
  rotbyte(&b1);\
  rotbyte(&b2);\
  rotbyte(&b3);\
  rotbyte(&b4);\
  rotbyte(&b5);\
  rotbyte(&b6);\
  rotbyte(&b7);\
  ;\
  sbox(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7);\
  ;\
  xor_rcon(&b0);\
  shufb(&b0, EXPB0);\
  shufb(&b1, EXPB0);\
  shufb(&b4, EXPB0);\
  shufb(&b6, EXPB0);\
  shufb(&b3, EXPB0);\
  shufb(&b7, EXPB0);\
  shufb(&b2, EXPB0);\
  shufb(&b5, EXPB0);\
  /* NOTE(review): b0 is shuffled a second time here, exactly as in the
   * original; harmless if EXPB0 is an idempotent broadcast -- confirm. */ \
  shufb(&b0, EXPB0);\
  ;\
  /* load round key 0 */ \
  t0 = *(int128 *)(bskey + 0);\
  t1 = *(int128 *)(bskey + 16);\
  t2 = *(int128 *)(bskey + 32);\
  t3 = *(int128 *)(bskey + 48);\
  t4 = *(int128 *)(bskey + 64);\
  t5 = *(int128 *)(bskey + 80);\
  t6 = *(int128 *)(bskey + 96);\
  t7 = *(int128 *)(bskey + 112);\
  ;\
  xor2(&b0, &t0);\
  xor2(&b1, &t1);\
  xor2(&b4, &t2);\
  xor2(&b6, &t3);\
  xor2(&b3, &t4);\
  xor2(&b7, &t5);\
  xor2(&b2, &t6);\
  xor2(&b5, &t7);\
  ;\
  /* fold in the three remaining words of the previous round key */ \
  rshift32_littleendian(&t0, 8);\
  rshift32_littleendian(&t1, 8);\
  rshift32_littleendian(&t2, 8);\
  rshift32_littleendian(&t3, 8);\
  rshift32_littleendian(&t4, 8);\
  rshift32_littleendian(&t5, 8);\
  rshift32_littleendian(&t6, 8);\
  rshift32_littleendian(&t7, 8);\
  ;\
  xor2(&b0, &t0);\
  xor2(&b1, &t1);\
  xor2(&b4, &t2);\
  xor2(&b6, &t3);\
  xor2(&b3, &t4);\
  xor2(&b7, &t5);\
  xor2(&b2, &t6);\
  xor2(&b5, &t7);\
  ;\
  rshift32_littleendian(&t0, 8);\
  rshift32_littleendian(&t1, 8);\
  rshift32_littleendian(&t2, 8);\
  rshift32_littleendian(&t3, 8);\
  rshift32_littleendian(&t4, 8);\
  rshift32_littleendian(&t5, 8);\
  rshift32_littleendian(&t6, 8);\
  rshift32_littleendian(&t7, 8);\
  ;\
  xor2(&b0, &t0);\
  xor2(&b1, &t1);\
  xor2(&b4, &t2);\
  xor2(&b6, &t3);\
  xor2(&b3, &t4);\
  xor2(&b7, &t5);\
  xor2(&b2, &t6);\
  xor2(&b5, &t7);\
  ;\
  rshift32_littleendian(&t0, 8);\
  rshift32_littleendian(&t1, 8);\
  rshift32_littleendian(&t2, 8);\
  rshift32_littleendian(&t3, 8);\
  rshift32_littleendian(&t4, 8);\
  rshift32_littleendian(&t5, 8);\
  rshift32_littleendian(&t6, 8);\
  rshift32_littleendian(&t7, 8);\
  ;\
  xor2(&b0, &t0);\
  xor2(&b1, &t1);\
  xor2(&b4, &t2);\
  xor2(&b6, &t3);\
  xor2(&b3, &t4);\
  xor2(&b7, &t5);\
  xor2(&b2, &t6);\
  xor2(&b5, &t7);\
  ;\
  /* store round key 1 */ \
  *(int128 *)(bskey + 128) = b0;\
  *(int128 *)(bskey + 144) = b1;\
  *(int128 *)(bskey + 160) = b4;\
  *(int128 *)(bskey + 176) = b6;\
  *(int128 *)(bskey + 192) = b3;\
  *(int128 *)(bskey + 208) = b7;\
  *(int128 *)(bskey + 224) = b2;\
  *(int128 *)(bskey + 240) = b5;
132 | |||
/*
 * keyexpbs10 -- final (10th) round of the bitsliced AES-128 key expansion.
 * Reads round key 9 at bskey + 9*128 and writes round key 10 at
 * bskey + 1280..1392, applying a final M0 shuffle to all eight planes.
 * The toggle() calls on planes 0,1,5,6 (of both the working state and the
 * loaded key) mirror keyexpbs below -- presumably compensating for the
 * complemented planes of the bitsliced S-box; confirm against toggle/sbox.
 */
#define keyexpbs10(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7, bskey) \
  toggle(&b0);\
  toggle(&b1);\
  toggle(&b5);\
  toggle(&b6);\
  rotbyte(&b0);\
  rotbyte(&b1);\
  rotbyte(&b2);\
  rotbyte(&b3);\
  rotbyte(&b4);\
  rotbyte(&b5);\
  rotbyte(&b6);\
  rotbyte(&b7);\
  ;\
  sbox(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7);\
  ;\
  xor_rcon(&b1);\
  xor_rcon(&b4);\
  xor_rcon(&b3);\
  xor_rcon(&b7);\
  shufb(&b0, EXPB0);\
  shufb(&b1, EXPB0);\
  shufb(&b4, EXPB0);\
  shufb(&b6, EXPB0);\
  shufb(&b3, EXPB0);\
  shufb(&b7, EXPB0);\
  shufb(&b2, EXPB0);\
  shufb(&b5, EXPB0);\
  ;\
  /* load round key 9 */ \
  t0 = *(int128 *)(bskey + 9 * 128 + 0);\
  t1 = *(int128 *)(bskey + 9 * 128 + 16);\
  t2 = *(int128 *)(bskey + 9 * 128 + 32);\
  t3 = *(int128 *)(bskey + 9 * 128 + 48);\
  t4 = *(int128 *)(bskey + 9 * 128 + 64);\
  t5 = *(int128 *)(bskey + 9 * 128 + 80);\
  t6 = *(int128 *)(bskey + 9 * 128 + 96);\
  t7 = *(int128 *)(bskey + 9 * 128 + 112);\
  ;\
  toggle(&t0);\
  toggle(&t1);\
  toggle(&t5);\
  toggle(&t6);\
  ;\
  xor2(&b0, &t0);\
  xor2(&b1, &t1);\
  xor2(&b4, &t2);\
  xor2(&b6, &t3);\
  xor2(&b3, &t4);\
  xor2(&b7, &t5);\
  xor2(&b2, &t6);\
  xor2(&b5, &t7);\
  ;\
  rshift32_littleendian(&t0, 8);\
  rshift32_littleendian(&t1, 8);\
  rshift32_littleendian(&t2, 8);\
  rshift32_littleendian(&t3, 8);\
  rshift32_littleendian(&t4, 8);\
  rshift32_littleendian(&t5, 8);\
  rshift32_littleendian(&t6, 8);\
  rshift32_littleendian(&t7, 8);\
  ;\
  xor2(&b0, &t0);\
  xor2(&b1, &t1);\
  xor2(&b4, &t2);\
  xor2(&b6, &t3);\
  xor2(&b3, &t4);\
  xor2(&b7, &t5);\
  xor2(&b2, &t6);\
  xor2(&b5, &t7);\
  ;\
  rshift32_littleendian(&t0, 8);\
  rshift32_littleendian(&t1, 8);\
  rshift32_littleendian(&t2, 8);\
  rshift32_littleendian(&t3, 8);\
  rshift32_littleendian(&t4, 8);\
  rshift32_littleendian(&t5, 8);\
  rshift32_littleendian(&t6, 8);\
  rshift32_littleendian(&t7, 8);\
  ;\
  xor2(&b0, &t0);\
  xor2(&b1, &t1);\
  xor2(&b4, &t2);\
  xor2(&b6, &t3);\
  xor2(&b3, &t4);\
  xor2(&b7, &t5);\
  xor2(&b2, &t6);\
  xor2(&b5, &t7);\
  ;\
  rshift32_littleendian(&t0, 8);\
  rshift32_littleendian(&t1, 8);\
  rshift32_littleendian(&t2, 8);\
  rshift32_littleendian(&t3, 8);\
  rshift32_littleendian(&t4, 8);\
  rshift32_littleendian(&t5, 8);\
  rshift32_littleendian(&t6, 8);\
  rshift32_littleendian(&t7, 8);\
  ;\
  xor2(&b0, &t0);\
  xor2(&b1, &t1);\
  xor2(&b4, &t2);\
  xor2(&b6, &t3);\
  xor2(&b3, &t4);\
  xor2(&b7, &t5);\
  xor2(&b2, &t6);\
  xor2(&b5, &t7);\
  ;\
  /* last round key gets the M0 layout expected by lastround() */ \
  shufb(&b0, M0);\
  shufb(&b1, M0);\
  shufb(&b2, M0);\
  shufb(&b3, M0);\
  shufb(&b4, M0);\
  shufb(&b5, M0);\
  shufb(&b6, M0);\
  shufb(&b7, M0);\
  ;\
  *(int128 *)(bskey + 1280) = b0;\
  *(int128 *)(bskey + 1296) = b1;\
  *(int128 *)(bskey + 1312) = b4;\
  *(int128 *)(bskey + 1328) = b6;\
  *(int128 *)(bskey + 1344) = b3;\
  *(int128 *)(bskey + 1360) = b7;\
  *(int128 *)(bskey + 1376) = b2;\
  *(int128 *)(bskey + 1392) = b5;
256 | |||
257 | |||
/*
 * keyexpbs -- generic middle round of the bitsliced AES-128 key expansion.
 * rcon: a statement (e.g. a sequence of xor_rcon(...) calls) expanded
 *       verbatim to apply the round constant for round i.
 * i:    round index; reads round key i-1 at bskey + (i-1)*128 and writes
 *       round key i at bskey + i*128.
 * b0..b7 / t0..t7: working and scratch bit planes (all clobbered).
 */
#define keyexpbs(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7, rcon, i, bskey) \
  toggle(&b0);\
  toggle(&b1);\
  toggle(&b5);\
  toggle(&b6);\
  rotbyte(&b0);\
  rotbyte(&b1);\
  rotbyte(&b2);\
  rotbyte(&b3);\
  rotbyte(&b4);\
  rotbyte(&b5);\
  rotbyte(&b6);\
  rotbyte(&b7);\
  ;\
  sbox(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7);\
  ;\
  rcon;\
  shufb(&b0, EXPB0);\
  shufb(&b1, EXPB0);\
  shufb(&b4, EXPB0);\
  shufb(&b6, EXPB0);\
  shufb(&b3, EXPB0);\
  shufb(&b7, EXPB0);\
  shufb(&b2, EXPB0);\
  shufb(&b5, EXPB0);\
  ;\
  /* load round key i-1 */ \
  t0 = *(int128 *)(bskey + (i-1) * 128 + 0);\
  t1 = *(int128 *)(bskey + (i-1) * 128 + 16);\
  t2 = *(int128 *)(bskey + (i-1) * 128 + 32);\
  t3 = *(int128 *)(bskey + (i-1) * 128 + 48);\
  t4 = *(int128 *)(bskey + (i-1) * 128 + 64);\
  t5 = *(int128 *)(bskey + (i-1) * 128 + 80);\
  t6 = *(int128 *)(bskey + (i-1) * 128 + 96);\
  t7 = *(int128 *)(bskey + (i-1) * 128 + 112);\
  ;\
  toggle(&t0);\
  toggle(&t1);\
  toggle(&t5);\
  toggle(&t6);\
  ;\
  xor2(&b0, &t0);\
  xor2(&b1, &t1);\
  xor2(&b4, &t2);\
  xor2(&b6, &t3);\
  xor2(&b3, &t4);\
  xor2(&b7, &t5);\
  xor2(&b2, &t6);\
  xor2(&b5, &t7);\
  ;\
  rshift32_littleendian(&t0, 8);\
  rshift32_littleendian(&t1, 8);\
  rshift32_littleendian(&t2, 8);\
  rshift32_littleendian(&t3, 8);\
  rshift32_littleendian(&t4, 8);\
  rshift32_littleendian(&t5, 8);\
  rshift32_littleendian(&t6, 8);\
  rshift32_littleendian(&t7, 8);\
  ;\
  xor2(&b0, &t0);\
  xor2(&b1, &t1);\
  xor2(&b4, &t2);\
  xor2(&b6, &t3);\
  xor2(&b3, &t4);\
  xor2(&b7, &t5);\
  xor2(&b2, &t6);\
  xor2(&b5, &t7);\
  ;\
  rshift32_littleendian(&t0, 8);\
  rshift32_littleendian(&t1, 8);\
  rshift32_littleendian(&t2, 8);\
  rshift32_littleendian(&t3, 8);\
  rshift32_littleendian(&t4, 8);\
  rshift32_littleendian(&t5, 8);\
  rshift32_littleendian(&t6, 8);\
  rshift32_littleendian(&t7, 8);\
  ;\
  xor2(&b0, &t0);\
  xor2(&b1, &t1);\
  xor2(&b4, &t2);\
  xor2(&b6, &t3);\
  xor2(&b3, &t4);\
  xor2(&b7, &t5);\
  xor2(&b2, &t6);\
  xor2(&b5, &t7);\
  ;\
  rshift32_littleendian(&t0, 8);\
  rshift32_littleendian(&t1, 8);\
  rshift32_littleendian(&t2, 8);\
  rshift32_littleendian(&t3, 8);\
  rshift32_littleendian(&t4, 8);\
  rshift32_littleendian(&t5, 8);\
  rshift32_littleendian(&t6, 8);\
  rshift32_littleendian(&t7, 8);\
  ;\
  xor2(&b0, &t0);\
  xor2(&b1, &t1);\
  xor2(&b4, &t2);\
  xor2(&b6, &t3);\
  xor2(&b3, &t4);\
  xor2(&b7, &t5);\
  xor2(&b2, &t6);\
  xor2(&b5, &t7);\
  ;\
  /* store round key i */ \
  *(int128 *)(bskey + i*128 + 0) = b0;\
  *(int128 *)(bskey + i*128 + 16) = b1;\
  *(int128 *)(bskey + i*128 + 32) = b4;\
  *(int128 *)(bskey + i*128 + 48) = b6;\
  *(int128 *)(bskey + i*128 + 64) = b3;\
  *(int128 *)(bskey + i*128 + 80) = b7;\
  *(int128 *)(bskey + i*128 + 96) = b2;\
  *(int128 *)(bskey + i*128 + 112) = b5;
369 | |||
370 | /* Macros used in multiple contexts */ | ||
371 | |||
/*
 * bitslicekey0 -- bitslice the 16-byte cipher key at `key` into the first
 * round-key slot (bskey + 0..112).  The key is broadcast into xmm0..xmm7
 * (one copy per plane), M0-shuffled, then transposed into bit planes by
 * bitslice().  Uses file-scope registers xmm0..xmm7 and t.
 */
#define bitslicekey0(key, bskey) \
  xmm0 = *(int128 *) (key + 0);\
  shufb(&xmm0, M0);\
  copy2(&xmm1, &xmm0);\
  copy2(&xmm2, &xmm0);\
  copy2(&xmm3, &xmm0);\
  copy2(&xmm4, &xmm0);\
  copy2(&xmm5, &xmm0);\
  copy2(&xmm6, &xmm0);\
  copy2(&xmm7, &xmm0);\
  ;\
  bitslice(xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0, t);\
  ;\
  *(int128 *) (bskey + 0) = xmm0;\
  *(int128 *) (bskey + 16) = xmm1;\
  *(int128 *) (bskey + 32) = xmm2;\
  *(int128 *) (bskey + 48) = xmm3;\
  *(int128 *) (bskey + 64) = xmm4;\
  *(int128 *) (bskey + 80) = xmm5;\
  *(int128 *) (bskey + 96) = xmm6;\
  *(int128 *) (bskey + 112) = xmm7;
393 | |||
394 | |||
/*
 * bitslicekey10 -- bitslice a last-round key at `key` into the final
 * round-key slot (bskey + 1280..1392).  Unlike bitslicekey0 there is no
 * M0 shuffle, and planes 0,1,5,6 are complemented after slicing (matching
 * the toggles applied by keyexpbs/keyexpbs10).
 *
 * FIX(review): the original called copy2(xmm1, xmm0) by value, unlike
 * every other copy2() call in this header, which passes addresses; if
 * copy2 takes int128* (as all other call sites imply) that would not
 * compile.  Corrected to pass addresses -- confirm against copy2's
 * declaration.
 */
#define bitslicekey10(key, bskey) \
  xmm0 = *(int128 *) (key + 0);\
  copy2(&xmm1, &xmm0);\
  copy2(&xmm2, &xmm0);\
  copy2(&xmm3, &xmm0);\
  copy2(&xmm4, &xmm0);\
  copy2(&xmm5, &xmm0);\
  copy2(&xmm6, &xmm0);\
  copy2(&xmm7, &xmm0);\
  ;\
  bitslice(xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0, t);\
  ;\
  toggle(&xmm6);\
  toggle(&xmm5);\
  toggle(&xmm1);\
  toggle(&xmm0);\
  ;\
  *(int128 *) (bskey + 0 + 1280) = xmm0;\
  *(int128 *) (bskey + 16 + 1280) = xmm1;\
  *(int128 *) (bskey + 32 + 1280) = xmm2;\
  *(int128 *) (bskey + 48 + 1280) = xmm3;\
  *(int128 *) (bskey + 64 + 1280) = xmm4;\
  *(int128 *) (bskey + 80 + 1280) = xmm5;\
  *(int128 *) (bskey + 96 + 1280) = xmm6;\
  *(int128 *) (bskey + 112 + 1280) = xmm7;
420 | |||
421 | |||
/*
 * bitslicekey -- bitslice the 16-byte round key at `key` into round-key
 * slot i (bskey + 128*i).  Like bitslicekey0 but additionally complements
 * planes 0,1,5,6 of the stored result (matching keyexpbs's toggles).
 */
#define bitslicekey(i,key,bskey) \
  xmm0 = *(int128 *) (key + 0);\
  shufb(&xmm0, M0);\
  copy2(&xmm1, &xmm0);\
  copy2(&xmm2, &xmm0);\
  copy2(&xmm3, &xmm0);\
  copy2(&xmm4, &xmm0);\
  copy2(&xmm5, &xmm0);\
  copy2(&xmm6, &xmm0);\
  copy2(&xmm7, &xmm0);\
  ;\
  bitslice(xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0, t);\
  ;\
  toggle(&xmm6);\
  toggle(&xmm5);\
  toggle(&xmm1);\
  toggle(&xmm0);\
  ;\
  *(int128 *) (bskey + 0 + 128*i) = xmm0;\
  *(int128 *) (bskey + 16 + 128*i) = xmm1;\
  *(int128 *) (bskey + 32 + 128*i) = xmm2;\
  *(int128 *) (bskey + 48 + 128*i) = xmm3;\
  *(int128 *) (bskey + 64 + 128*i) = xmm4;\
  *(int128 *) (bskey + 80 + 128*i) = xmm5;\
  *(int128 *) (bskey + 96 + 128*i) = xmm6;\
  *(int128 *) (bskey + 112 + 128*i) = xmm7;
448 | |||
449 | |||
/*
 * bitslice -- 8x8 bit-matrix transpose across x0..x7 using three rounds of
 * swapmove (distance 1, 2, 4 with masks BS0, BS1, BS2).  t is scratch.
 * Applying it twice restores the original data (it is an involution by
 * construction of swapmove).
 */
#define bitslice(x0, x1, x2, x3, x4, x5, x6, x7, t) \
  swapmove(x0, x1, 1, BS0, t);\
  swapmove(x2, x3, 1, BS0, t);\
  swapmove(x4, x5, 1, BS0, t);\
  swapmove(x6, x7, 1, BS0, t);\
  ;\
  swapmove(x0, x2, 2, BS1, t);\
  swapmove(x1, x3, 2, BS1, t);\
  swapmove(x4, x6, 2, BS1, t);\
  swapmove(x5, x7, 2, BS1, t);\
  ;\
  swapmove(x0, x4, 4, BS2, t);\
  swapmove(x1, x5, 4, BS2, t);\
  swapmove(x2, x6, 4, BS2, t);\
  swapmove(x3, x7, 4, BS2, t);
465 | |||
466 | |||
/*
 * swapmove -- exchange, between a and b, the bit groups selected by mask m
 * at shift distance n: classic bit-matrix transpose step
 *   t = ((b >> n) ^ a) & m;  a ^= t;  b ^= t << n;
 * expressed with the int128 helper calls.  t is scratch.
 */
#define swapmove(a, b, n, m, t) \
  copy2(&t, &b);\
  rshift64_littleendian(&t, n);\
  xor2(&t, &a);\
  and2(&t, &m);\
  xor2(&a, &t);\
  lshift64_littleendian(&t, n);\
  xor2(&b, &t);
475 | |||
/* rotbyte -- rotate each 32-bit word of *x by one byte via a ROTB shuffle.
 * TODO (from original author): make faster. */
#define rotbyte(x) shufb(x, ROTB)
478 | |||
479 | |||
480 | /* Macros used for encryption (and decryption) */ | ||
481 | |||
/*
 * shiftrows -- AddRoundKey + ShiftRows for round i on bitsliced state
 * x0..x7: each plane is XORed with round key i-1 (at bskey + 128*(i-1))
 * and then byte-shuffled with mask M (SR for inner rounds, SRM0 for the
 * last round -- see aesround/lastround).
 */
#define shiftrows(x0, x1, x2, x3, x4, x5, x6, x7, i, M, bskey) \
  xor2(&x0, (int128 *)(bskey + 128*(i-1) + 0));\
  shufb(&x0, M);\
  xor2(&x1, (int128 *)(bskey + 128*(i-1) + 16));\
  shufb(&x1, M);\
  xor2(&x2, (int128 *)(bskey + 128*(i-1) + 32));\
  shufb(&x2, M);\
  xor2(&x3, (int128 *)(bskey + 128*(i-1) + 48));\
  shufb(&x3, M);\
  xor2(&x4, (int128 *)(bskey + 128*(i-1) + 64));\
  shufb(&x4, M);\
  xor2(&x5, (int128 *)(bskey + 128*(i-1) + 80));\
  shufb(&x5, M);\
  xor2(&x6, (int128 *)(bskey + 128*(i-1) + 96));\
  shufb(&x6, M);\
  xor2(&x7, (int128 *)(bskey + 128*(i-1) + 112));\
  shufb(&x7, M);
499 | |||
500 | |||
/*
 * mixcolumns -- bitsliced AES MixColumns.  Inputs are the state planes
 * x0..x7 (clobbered); the mixed result is left in t0..t7.  The 0x93 and
 * 0x4e shufd constants rotate the 32-bit columns by one and two positions
 * respectively; the xor pattern implements multiplication by {02}/{03}
 * plane-by-plane (the extra xors with x7 fold in the field reduction).
 */
#define mixcolumns(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, t7) \
  shufd(&t0, &x0, 0x93);\
  shufd(&t1, &x1, 0x93);\
  shufd(&t2, &x2, 0x93);\
  shufd(&t3, &x3, 0x93);\
  shufd(&t4, &x4, 0x93);\
  shufd(&t5, &x5, 0x93);\
  shufd(&t6, &x6, 0x93);\
  shufd(&t7, &x7, 0x93);\
  ;\
  xor2(&x0, &t0);\
  xor2(&x1, &t1);\
  xor2(&x2, &t2);\
  xor2(&x3, &t3);\
  xor2(&x4, &t4);\
  xor2(&x5, &t5);\
  xor2(&x6, &t6);\
  xor2(&x7, &t7);\
  ;\
  xor2(&t0, &x7);\
  xor2(&t1, &x0);\
  xor2(&t2, &x1);\
  xor2(&t1, &x7);\
  xor2(&t3, &x2);\
  xor2(&t4, &x3);\
  xor2(&t5, &x4);\
  xor2(&t3, &x7);\
  xor2(&t6, &x5);\
  xor2(&t7, &x6);\
  xor2(&t4, &x7);\
  ;\
  shufd(&x0, &x0, 0x4e);\
  shufd(&x1, &x1, 0x4e);\
  shufd(&x2, &x2, 0x4e);\
  shufd(&x3, &x3, 0x4e);\
  shufd(&x4, &x4, 0x4e);\
  shufd(&x5, &x5, 0x4e);\
  shufd(&x6, &x6, 0x4e);\
  shufd(&x7, &x7, 0x4e);\
  ;\
  xor2(&t0, &x0);\
  xor2(&t1, &x1);\
  xor2(&t2, &x2);\
  xor2(&t3, &x3);\
  xor2(&t4, &x4);\
  xor2(&t5, &x5);\
  xor2(&t6, &x6);\
  xor2(&t7, &x7);
549 | |||
550 | |||
/*
 * aesround -- one inner AES round on bitsliced state b0..b7:
 * AddRoundKey+ShiftRows, then SubBytes, then MixColumns.  Note the plane
 * permutation b0,b1,b4,b6,b3,b7,b2,b5 passed to mixcolumns: that is the
 * order in which sbox leaves its outputs.  The new state ends up in
 * t0..t7 (mixcolumns writes its result there).
 */
#define aesround(i, b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7, bskey) \
  shiftrows(b0, b1, b2, b3, b4, b5, b6, b7, i, SR, bskey);\
  sbox(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7);\
  mixcolumns(b0, b1, b4, b6, b3, b7, b2, b5, t0, t1, t2, t3, t4, t5, t6, t7);
555 | |||
556 | |||
/*
 * lastround -- final AES round (no MixColumns): AddRoundKey+ShiftRows
 * with the SRM0 shuffle, SubBytes, then XOR of round key 10
 * (at bskey + 128*10) into the S-box output planes, which sbox leaves in
 * order b0,b1,b4,b6,b3,b7,b2,b5.
 */
#define lastround(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7, bskey) \
  shiftrows(b0, b1, b2, b3, b4, b5, b6, b7, 10, SRM0, bskey);\
  sbox(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7);\
  xor2(&b0,(int128 *)(bskey + 128*10));\
  xor2(&b1,(int128 *)(bskey + 128*10+16));\
  xor2(&b4,(int128 *)(bskey + 128*10+32));\
  xor2(&b6,(int128 *)(bskey + 128*10+48));\
  xor2(&b3,(int128 *)(bskey + 128*10+64));\
  xor2(&b7,(int128 *)(bskey + 128*10+80));\
  xor2(&b2,(int128 *)(bskey + 128*10+96));\
  xor2(&b5,(int128 *)(bskey + 128*10+112));
568 | |||
569 | |||
/*
 * sbox -- bitsliced AES S-box on planes b0..b7 (t0..t3/s0..s3 scratch):
 * basis change into GF(2^8) tower representation, inversion in GF(2^8),
 * then basis change back.  The argument permutations on the inner calls
 * route each plane to the position the sub-macros expect.
 */
#define sbox(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, s0, s1, s2, s3) \
  InBasisChange(b0, b1, b2, b3, b4, b5, b6, b7); \
  Inv_GF256(b6, b5, b0, b3, b7, b1, b4, b2, t0, t1, t2, t3, s0, s1, s2, s3); \
  OutBasisChange(b7, b1, b4, b2, b6, b5, b0, b3);
574 | |||
575 | |||
/*
 * InBasisChange -- linear change of basis applied to the eight bit planes
 * before GF(2^8) inversion (XOR-only, hence its own inverse composition is
 * handled by OutBasisChange with permuted arguments).
 */
#define InBasisChange(b0, b1, b2, b3, b4, b5, b6, b7) \
  xor2(&b5, &b6);\
  xor2(&b2, &b1);\
  xor2(&b5, &b0);\
  xor2(&b6, &b2);\
  xor2(&b3, &b0);\
  ;\
  xor2(&b6, &b3);\
  xor2(&b3, &b7);\
  xor2(&b3, &b4);\
  xor2(&b7, &b5);\
  xor2(&b3, &b1);\
  ;\
  xor2(&b4, &b5);\
  xor2(&b2, &b7);\
  xor2(&b1, &b5);
592 | |||
/*
 * OutBasisChange -- linear change of basis applied to the eight bit planes
 * after GF(2^8) inversion, completing the AES S-box affine map (XOR-only).
 */
#define OutBasisChange(b0, b1, b2, b3, b4, b5, b6, b7) \
  xor2(&b0, &b6);\
  xor2(&b1, &b4);\
  xor2(&b2, &b0);\
  xor2(&b4, &b6);\
  xor2(&b6, &b1);\
  ;\
  xor2(&b1, &b5);\
  xor2(&b5, &b3);\
  xor2(&b2, &b5);\
  xor2(&b3, &b7);\
  xor2(&b7, &b5);\
  ;\
  xor2(&b4, &b7);
607 | |||
/*
 * Mul_GF4 -- bitsliced multiplication in GF(2^2):
 * (x0,x1) *= (y0,y1), result planes left in (x0,x1); t0 is scratch.
 */
#define Mul_GF4(x0, x1, y0, y1, t0) \
  copy2(&t0, &y0);\
  xor2(&t0, &y1);\
  and2(&t0, &x0);\
  xor2(&x0, &x1);\
  and2(&x0, &y1);\
  and2(&x1, &y0);\
  xor2(&x0, &x1);\
  xor2(&x1, &t0);
617 | |||
/*
 * Mul_GF4_N -- GF(2^2) multiplication like Mul_GF4, but with the two
 * final xors swapped relative to Mul_GF4 (result planes come out with a
 * different assignment, as required by the Mul_GF16* callers).
 */
#define Mul_GF4_N(x0, x1, y0, y1, t0) \
  copy2(&t0, &y0);\
  xor2(&t0, &y1);\
  and2(&t0, &x0);\
  xor2(&x0, &x1);\
  and2(&x0, &y1);\
  and2(&x1, &y0);\
  xor2(&x1, &x0);\
  xor2(&x0, &t0);
627 | |||
/*
 * Mul_GF4_2 -- two GF(2^2) multiplications sharing the factor (y0,y1):
 * (x0,x1) *= (y0,y1) and (x2,x3) *= (y0,y1), interleaved; t0,t1 scratch.
 *
 * FIX(review): the first statement read "copy2(&t0, = y0);", a syntax
 * error on macro expansion; restored to "copy2(&t0, &y0);" to match the
 * identical opening of Mul_GF4 and Mul_GF4_N above.
 */
#define Mul_GF4_2(x0, x1, x2, x3, y0, y1, t0, t1) \
  copy2(&t0, &y0);\
  xor2(&t0, &y1);\
  copy2(&t1, &t0);\
  and2(&t0, &x0);\
  and2(&t1, &x2);\
  xor2(&x0, &x1);\
  xor2(&x2, &x3);\
  and2(&x0, &y1);\
  and2(&x2, &y1);\
  and2(&x1, &y0);\
  and2(&x3, &y0);\
  xor2(&x0, &x1);\
  xor2(&x2, &x3);\
  xor2(&x1, &t0);\
  xor2(&x3, &t1);
644 | |||
/*
 * Mul_GF16 -- bitsliced multiplication in GF(2^4) built from three GF(2^2)
 * multiplications (Karatsuba-style): result in (x0..x3); clobbers (y0,y1)
 * and uses t0..t3 as scratch.
 */
#define Mul_GF16(x0, x1, x2, x3, y0, y1, y2, y3, t0, t1, t2, t3) \
  copy2(&t0, &x0);\
  copy2(&t1, &x1);\
  Mul_GF4(x0, x1, y0, y1, t2);\
  xor2(&t0, &x2);\
  xor2(&t1, &x3);\
  xor2(&y0, &y2);\
  xor2(&y1, &y3);\
  Mul_GF4_N(t0, t1, y0, y1, t2);\
  Mul_GF4(x2, x3, y2, y3, t3);\
  ;\
  xor2(&x0, &t0);\
  xor2(&x2, &t0);\
  xor2(&x1, &t1);\
  xor2(&x3, &t1);
660 | |||
/*
 * Mul_GF16_2 -- two GF(2^4) multiplications sharing the factor
 * (y0..y3): (x0..x3) *= (y0..y3) and (x4..x7) *= (y0..y3).
 * The shared factor planes y0,y1 are modified between the halves exactly
 * as the second half expects; t0..t3 are scratch.
 */
#define Mul_GF16_2(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, t0, t1, t2, t3) \
  copy2(&t0, &x0);\
  copy2(&t1, &x1);\
  Mul_GF4(x0, x1, y0, y1, t2);\
  xor2(&t0, &x2);\
  xor2(&t1, &x3);\
  xor2(&y0, &y2);\
  xor2(&y1, &y3);\
  Mul_GF4_N(t0, t1, y0, y1, t3);\
  Mul_GF4(x2, x3, y2, y3, t2);\
  ;\
  xor2(&x0, &t0);\
  xor2(&x2, &t0);\
  xor2(&x1, &t1);\
  xor2(&x3, &t1);\
  ;\
  copy2(&t0, &x4);\
  copy2(&t1, &x5);\
  xor2(&t0, &x6);\
  xor2(&t1, &x7);\
  Mul_GF4_N(t0, t1, y0, y1, t3);\
  Mul_GF4(x6, x7, y2, y3, t2);\
  xor2(&y0, &y2);\
  xor2(&y1, &y3);\
  Mul_GF4(x4, x5, y0, y1, t3);\
  ;\
  xor2(&x4, &t0);\
  xor2(&x6, &t0);\
  xor2(&x5, &t1);\
  xor2(&x7, &t1);
691 | |||
/*
 * Inv_GF16 -- bitsliced inversion in GF(2^4) on planes (x0..x3):
 * computes the inverse's tower-field components into (t1,t0) with
 * AND/OR/XOR logic, then multiplies them back in with Mul_GF4_2.
 * t0..t3 are scratch.
 */
#define Inv_GF16(x0, x1, x2, x3, t0, t1, t2, t3) \
  copy2(&t0, &x1);\
  copy2(&t1, &x0);\
  and2(&t0, &x3);\
  or2(&t1, &x2);\
  copy2(&t2, &x1);\
  copy2(&t3, &x0);\
  or2(&t2, &x2);\
  or2(&t3, &x3);\
  xor2(&t2, &t3);\
  ;\
  xor2(&t0, &t2);\
  xor2(&t1, &t2);\
  ;\
  Mul_GF4_2(x0, x1, x2, x3, t1, t0, t2, t3);
707 | |||
708 | |||
/*
 * Inv_GF256 -- bitsliced inversion in GF(2^8) over the tower
 * GF(2^8)/GF(2^4)/GF(2^2): the combinational core of the AES S-box.
 * x0..x7 are the input planes (clobbered; result left in x0..x7 by the
 * final Mul_GF16_2).  t0..t3 and s0..s3 are scratch.  The statement order
 * below is load-bearing (every op reads values written by earlier ops);
 * do not reorder.
 */
#define Inv_GF256(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, s0, s1, s2, s3) \
  copy2(&t3, &x4);\
  copy2(&t2, &x5);\
  copy2(&t1, &x1);\
  copy2(&s1, &x7);\
  copy2(&s0, &x0);\
  ;\
  xor2(&t3, &x6);\
  xor2(&t2, &x7);\
  xor2(&t1, &x3);\
  xor2(&s1, &x6);\
  xor2(&s0, &x2);\
  ;\
  copy2(&s2, &t3);\
  copy2(&t0, &t2);\
  copy2(&s3, &t3);\
  ;\
  or2(&t2, &t1);\
  or2(&t3, &s0);\
  xor2(&s3, &t0);\
  and2(&s2, &s0);\
  and2(&t0, &t1);\
  xor2(&s0, &t1);\
  and2(&s3, &s0);\
  copy2(&s0, &x3);\
  xor2(&s0, &x2);\
  and2(&s1, &s0);\
  xor2(&t3, &s1);\
  xor2(&t2, &s1);\
  copy2(&s1, &x4);\
  xor2(&s1, &x5);\
  copy2(&s0, &x1);\
  copy2(&t1, &s1);\
  xor2(&s0, &x0);\
  or2(&t1, &s0);\
  and2(&s1, &s0);\
  xor2(&t0, &s1);\
  xor2(&t3, &s3);\
  xor2(&t2, &s2);\
  xor2(&t1, &s3);\
  xor2(&t0, &s2);\
  xor2(&t1, &s2);\
  copy2(&s0, &x7);\
  copy2(&s1, &x6);\
  copy2(&s2, &x5);\
  copy2(&s3, &x4);\
  and2(&s0, &x3);\
  and2(&s1, &x2);\
  and2(&s2, &x1);\
  or2(&s3, &x0);\
  xor2(&t3, &s0);\
  xor2(&t2, &s1);\
  xor2(&t1, &s2);\
  xor2(&t0, &s3);\
  ;\
  /* GF(2^4) inversion of the intermediate (t3,t2,t1,t0) */ \
  copy2(&s0, &t3);\
  xor2(&s0, &t2);\
  and2(&t3, &t1);\
  copy2(&s2, &t0);\
  xor2(&s2, &t3);\
  copy2(&s3, &s0);\
  and2(&s3, &s2);\
  xor2(&s3, &t2);\
  copy2(&s1, &t1);\
  xor2(&s1, &t0);\
  xor2(&t3, &t2);\
  and2(&s1, &t3);\
  xor2(&s1, &t0);\
  xor2(&t1, &s1);\
  copy2(&t2, &s2);\
  xor2(&t2, &s1);\
  and2(&t2, &t0);\
  xor2(&t1, &t2);\
  xor2(&s2, &t2);\
  and2(&s2, &s3);\
  xor2(&s2, &s0);\
  ;\
  Mul_GF16_2(x0, x1, x2, x3, x4, x5, x6, x7, s3, s2, s1, t1, s0, t0, t2, t3);
787 | |||
788 | #endif | ||