Wire Sysio Wire Sysion 1.0.0
Loading...
Searching...
No Matches
zm2.cpp
Go to the documentation of this file.
1/*
2 bn::Fp is a finite field with characteristic 254-bit prime integer
3*/
4#include <iostream>
5#include "zm.h"
6#ifdef MIE_USE_X64ASM
7#define XBYAK_NO_OP_NAMES
8#include "xbyak/xbyak.h"
9#include "xbyak/xbyak_util.h"
11#endif
12#include "bn.h"
13#if defined(_MSC_VER) && (_MSC_VER <= 1500)
14typedef unsigned char uint8_t;
15#else
16#include <stdint.h>
17#endif
18
19using namespace bn;
20
21#ifdef DEBUG_COUNT
22extern int g_count_m256;
23extern int g_count_r512;
24extern int g_count_add256;
25#endif
26
27// for C
28// r = (1 << 256) % p
29// rr = r^(-1) % p
// Storage for the static data members of Fp.
30mie::Vuint Fp::p_;
31mie::Vuint Fp::montgomeryR_;
32mie::Vuint Fp::p_add1_div4_;
33Fp Fp::montgomeryR2_;
34Fp Fp::one_;
35Fp Fp::invTbl_[512];
// File-local bound used by the FpDbl_*C routines below: operands are asserted
// to be < pN and results are wrapped by adding/subtracting pN.
37static mie::Vuint pN;
38
39/*
40 p = 0x2523648240000001,ba344d8000000008,6121000000000013,a700000000000013
41 N = 1 << 256
42 6p < N, 7p > N
43 s_pTbl[i] = ip for i < 7
44*/
45
// pTblSize bounds the index accepted by Fp::getDirectP (see its assert).
46const size_t pTblSize = 10;
// NOTE(review): pNtblSize is not referenced in the visible part of this file.
47const size_t pNtblSize = 4;
55
// Table of multiples of p (see the note above); &s_pTbl[1] is the address the
// generated code and the C fallbacks use to read the modulus.
56static Fp *s_pTbl;
// Storage for the static member Fp::quarterTbl_.
58Fp *Fp::quarterTbl_;
60
62
63static inline void Fp_addC(Fp& out, const Fp& x, const Fp& y)
64{
65 static const mie::Vuint p(&s_pTbl[1][0], Fp::N);
66 mie::Vuint a(&x[0], Fp::N), b(&y[0], Fp::N);
67 a += b;
68 if (a >= p) {
69 a -= p;
70 }
71 Fp::setDirect(out, a);
72}
73
74static inline void Fp_addNC_C(Fp& out, const Fp& x, const Fp& y)
75{
76 mie::Vuint a(&x[0], Fp::N), b(&y[0], Fp::N);
77 a += b;
78 Fp::setDirect(out, a);
79}
80
81static inline void Fp_subNC_C(Fp& out, const Fp& x, const Fp& y)
82{
83 mie::Vuint a(&x[0], Fp::N), b(&y[0], Fp::N);
84 a -= b;
85 Fp::setDirect(out, a);
86}
87
88static inline void Fp_subC(Fp& out, const Fp& x, const Fp& y)
89{
90 static const mie::Vuint p(&s_pTbl[1][0], Fp::N);
91 mie::Vuint a(&x[0], Fp::N), b(&y[0], Fp::N);
92 if (a < b) {
93 a = a + p - b;
94 } else {
95 a -= p;
96 }
97 Fp::setDirect(out, a);
98}
99
100static inline void Fp_mulC(Fp& out, const Fp& x, const Fp& y)
101{
102 Fp_emu a(x.get()), b(y.get());
103 a *= b;
104 out.set(a.get());
105}
106
// Constant used by FpDbl_modC: each reduction step multiplies the lowest limb
// of the accumulator by pp_mont to obtain the quotient q.
108static mie::Unit pp_mont;
109
110static void Fp_negC(Fp& out, const Fp& x)
111{
112 static const Fp zero(0);
113 Fp::sub(out, zero, x);
114}
115
116void (*Fp::add)(Fp& out, const Fp& x, const Fp& y) = &Fp_addC;
117void (*Fp::addNC)(Fp& out, const Fp& x, const Fp& y) = &Fp_addNC_C;
118void (*Fp::subNC)(Fp& out, const Fp& x, const Fp& y) = &Fp_subNC_C;
119void (*Fp::shr1)(Fp& out, const Fp& x) = 0;
120void (*Fp::shr2)(Fp& out, const Fp& x) = 0;
121void (*Fp::sub)(Fp& out, const Fp& x, const Fp& y) = &Fp_subC;
122void (*Fp::neg)(Fp& out, const Fp& x) = &Fp_negC;
123void (*Fp::mul)(Fp& out, const Fp& x, const Fp& y) = &Fp_mulC;
124int (*Fp::preInv)(Fp& r, const Fp& x) = 0;
125
126const Fp& Fp::getDirectP(int n)
127{
128 assert(0 <= n && (size_t)n < pTblSize);
129 return s_pTbl[n];
130}
131
132static void FpDbl_addC(FpDbl &z, const FpDbl &x, const FpDbl &y)
133{
134 mie::Vuint a(x.const_ptr(), Fp::N * 2);
135 mie::Vuint b(y.const_ptr(), Fp::N * 2);
136
137 assert(a < pN);
138 assert(b < pN);
139 a += b;
140 if (a >= pN) {
141 a -= pN;
142 }
143 z.setDirect(a);
144}
145
146static void FpDbl_addNC_C(FpDbl &z, const FpDbl &x, const FpDbl &y)
147{
148 mie::Vuint a(x.const_ptr(), Fp::N * 2);
149 mie::Vuint b(y.const_ptr(), Fp::N * 2);
150
151 a += b;
152 z.setDirect(a);
153}
154
155static void FpDbl_negC(FpDbl &z, const FpDbl &x)
156{
157 mie::Vuint a(x.const_ptr(), Fp::N * 2);
158 assert(a < pN);
159 z.setDirect(a.isZero() ? a : pN - a);
160}
161
162static void FpDbl_subC(FpDbl &z, const FpDbl &x, const FpDbl &y)
163{
164 mie::Vuint a(x.const_ptr(), Fp::N * 2);
165 mie::Vuint b(y.const_ptr(), Fp::N * 2);
166
167 assert(a < pN);
168 assert(b < pN);
169
170 if (a < b) {
171 a += pN;
172 }
173 a -= b;
174 z.setDirect(a);
175}
176
177static void FpDbl_subNC_C(FpDbl &z, const FpDbl &x, const FpDbl &y)
178{
179 mie::Vuint a(x.const_ptr(), Fp::N * 2);
180 mie::Vuint b(y.const_ptr(), Fp::N * 2);
181
182 a -= b;
183 z.setDirect(a);
184}
185
186static void FpDbl_mulC(FpDbl &z, const Fp &x, const Fp &y)
187{
188 mie::Vuint a(&x[0], Fp::N);
189 mie::Vuint b(&y[0], Fp::N);
190 a *= b;
191 z.setDirect(a);
192}
193
194static void FpDbl_modC(Fp& out, const FpDbl& x)
195{
196 const size_t UnitLen = sizeof(mie::Unit) * 8;
197 mie::Vuint c(x.const_ptr(), Fp::N * 2);
198 const mie::Vuint& p =Fp::getModulo();
199
200 const size_t n = 256 / UnitLen;
201 for (size_t i = 0; i < n; i++) {
202 mie::Unit u = c[0];
203 mie::Unit q = u * pp_mont;
204 c += q * p;
205 c >>= UnitLen;
206 }
207 if (c >= p) {
208 c -= p;
209 }
210 Fp::setDirect(out, c);
211}
212
213FpDbl::bin_op *FpDbl::add = &FpDbl_addC;
214FpDbl::bin_op *FpDbl::addNC = &FpDbl_addNC_C;
215FpDbl::uni_op *FpDbl::neg = &FpDbl_negC;
216FpDbl::bin_op *FpDbl::sub = &FpDbl_subC;
217FpDbl::bin_op *FpDbl::subNC = &FpDbl_subNC_C;
218void (*FpDbl::mul)(Dbl&, const Fp&, const Fp&) = &FpDbl_mulC;
219void (*FpDbl::mod)(Fp&, const Dbl&) = &FpDbl_modC;
220
221
222#ifdef MIE_USE_X64ASM
223using namespace Xbyak;
224
225struct CpuExt {
226 int type;
227 int model;
228 int family;
229 int stepping;
230 int extModel;
231 int extFamily;
232 int displayFamily;
233 int displayModel;
234 CpuExt()
235 {
236 unsigned int data[4];
238 stepping = data[0] & mask(4);
239 model = (data[0] >> 4) & mask(4);
240 family = (data[0] >> 8) & mask(4);
241 type = (data[0] >> 12) & mask(2);
242 extModel = (data[0] >> 16) & mask(4);
243 extFamily = (data[0] >> 20) & mask(8);
244 if (family == 0x0f) {
245 displayFamily = family + extFamily;
246 } else {
247 displayFamily = family;
248 }
249 if (family == 6 || family == 0x0f) {
250 displayModel = (extModel << 4) + model;
251 } else {
252 displayModel = model;
253 }
254 }
255 unsigned int mask(int n) const
256 {
257 return (1U << n) - 1;
258 }
259};
260
261/*
262 -m 1 = interleaveLoad : true
263 -m 0 = interleaveLoad : false
264
265 interleaveLoad is fast
266 cpu vendor=intel, family=6, model=7, extFamily=0, extModel=1, stepping=6 ; core2duo
267 cpu vendor=intel, family=6, model=12, extFamily=0, extModel=2, stepping=2 ; Xeon X5650
268 cpu vendor=intel, family=6, model=14, extFamily=0, extModel=1, stepping=5 ; Core i7 860
269 cpu vendor=intel, family=6, model=5, extFamily=0, extModel=2, stepping=2 ; Core i5 M 520
270
271
272 interleaveLoad is slow
273 cpu vendor=intel, family=6, model=10, extFamily=0, extModel=2, stepping=7 ; core i7 2620
274 cpu vendor=amd, family=15, model=4, extFamily=1, extModel=0, stepping=2 ; Opteron 2376
275*/
// Code-generation switches, both assigned by detectCpu():
// interleaveLoad selects the load/arithmetic-interleaved instruction order in
// the generator helpers; g_useMulx selects the mulx-based multiply sequences.
276bool interleaveLoad = false;
277bool g_useMulx = false;
278
279void detectCpu(int mode, bool useMulx)
280{
281 using namespace Xbyak::util;
283 CpuExt ext;
284 bool isIntel = cpu.has(Xbyak::util::Cpu::tINTEL);
285// printf("cpu vendor=%s, ", isIntel ? "intel" : "amd");
286// printf("family=%d, model=%d, extFamily=%d, extModel=%d, stepping=%d\n", ext.family, ext.model, ext.extFamily, ext.extModel, ext.stepping);
287// if (isIntel) printf("dislpayFamily=%02xh, displayModel=%02xh\n", ext.displayFamily, ext.displayModel);
288 switch (mode) {
289 case 0:
290 interleaveLoad = false;
291 break;
292 case 1:
293 interleaveLoad = true;
294 break;
295 default:
296 interleaveLoad = true;
297 if (!isIntel || (ext.family == 6 && ext.displayModel == 0x2a)) {
298 interleaveLoad = false;
299 }
300// printf("-m %d option is selected, but try -m %d to verify the determination.\n", interleaveLoad, 1 - interleaveLoad);
301 break;
302 }
303 if (cpu.has(Xbyak::util::Cpu::tBMI2)) {
304 g_useMulx = useMulx;
305 if (g_useMulx) {
306// fprintf(stderr, "use mulx\n");
307 }
308 } else {
309 g_useMulx = false;
310 }
311// printf("interleaveLoad=%d\n", interleaveLoad);
312}
313
314// for debug
315static Xbyak::util::Cpu s_cpu;
317int debug_counter;
318struct PutDebugCounter {
319 ~PutDebugCounter()
320 {
321 if (debug_counter) printf("debug_counter=%d\n", debug_counter);
322 }
323} s_putDebugCounter;
324
325struct PairingCode;
326template<class Code = PairingCode>
327struct MakeStackFrame {
328 Code *code_;
329 int P_;
330 MakeStackFrame(Code *code, int gtn, int numQword = 0)
331 : code_(code)
332 , P_(code_->storeReg(gtn, numQword))
333 {
334 code_->isRaxP_ = false;
335 }
336 ~MakeStackFrame()
337 {
338 code_->restoreReg(P_);
339 code_->ret();
340 }
341};
342
/*
    Ext1, Ext2, Ext6 and Ext12 are helper classes that compute the byte offset
    and size of (nested) field elements relative to a base register.
*/
346template<class F>
347struct Ext1 {
348 Ext1(const Reg64& r, int n = 0)
349 : r_(r)
350 , n_(n)
351 , next(sizeof(F) + n)
352 {
353 }
354 operator RegExp() const { return r_ + n_; }
355 const Reg64& r_;
356 const int n_;
357 const int next;
358private:
359 Ext1(const Ext1&);
360 void operator=(const Ext1&);
361};
362
363template<class F>
364struct Ext2 {
365 Ext2(const Reg64& r, int n = 0)
366 : r_(r)
367 , n_(n)
368 , next(sizeof(F) * 2 + n)
369 , a_(r, n)
370 , b_(r, n + sizeof(F))
371 {
372 }
373 operator RegExp() const { return r_ + n_; }
374 const Reg64& r_;
375 const int n_;
376 const int next;
377 Ext1<F> a_;
378 Ext1<F> b_;
379private:
380 Ext2(const Ext2&);
381 void operator=(const Ext2&);
382};
383
384template<class F>
385struct Ext6 {
386 Ext6(const Reg64& r, int n = 0)
387 : r_(r)
388 , n_(n)
389 , next(sizeof(F) * 6 + n)
390 , a_(r, n)
391 , b_(r, n + sizeof(F) * 2)
392 , c_(r, n + sizeof(F) * 4)
393 {
394 }
395 operator RegExp() const { return r_ + n_; }
396 const Reg64& r_;
397 const int n_;
398 const int next;
399 Ext2<F> a_;
400 Ext2<F> b_;
401 Ext2<F> c_;
402private:
403 Ext6(const Ext6&);
404 void operator=(const Ext6&);
405};
406
407template<class F>
408struct Ext12 {
409 Ext12(const Reg64& r, int n = 0)
410 : r_(r)
411 , n_(n)
412 , next(sizeof(F) * 12 + n)
413 , a_(r, n)
414 , b_(r, n + sizeof(F) * 6)
415 {
416 }
417 operator RegExp() const { return r_ + n_; }
418 const Reg64& r_;
419 const int n_;
420 const int next;
421 Ext6<F> a_;
422 Ext6<F> b_;
423private:
424 Ext12(const Ext12&);
425 void operator=(const Ext12&);
426};
427
428struct PairingCode : Xbyak::CodeGenerator {
429 /*
430 [z3:z2:z1:z0] = [m3:m2:m1:m0]
431 */
432 void load_rm(const Reg64& z3, const Reg64& z2, const Reg64& z1, const Reg64& z0,
433 const RegExp& m)
434 {
435 mov(z0, ptr [m + 8 * 0]);
436 mov(z1, ptr [m + 8 * 1]);
437 mov(z2, ptr [m + 8 * 2]);
438 mov(z3, ptr [m + 8 * 3]);
439 }
440 /*
441 [z3:z2:z1:z0] = [m3:m2:m1:m0]
442 */
443 void store_mr(const RegExp& m, const Reg64& x3, const Reg64& x2, const Reg64& x1, const Reg64& x0)
444 {
445 mov(ptr [m + 8 * 0], x0);
446 mov(ptr [m + 8 * 1], x1);
447 mov(ptr [m + 8 * 2], x2);
448 mov(ptr [m + 8 * 3], x3);
449 }
450 /*
451 [z3:z2:z1:z0] += [x3:x2:x1:x0]
452 */
453 void add_rr(const Reg64& z3, const Reg64& z2, const Reg64& z1, const Reg64& z0,
454 const Reg64& x3, const Reg64& x2, const Reg64& x1, const Reg64& x0)
455 {
456 add(z0, x0);
457 adc(z1, x1);
458 adc(z2, x2);
459 adc(z3, x3);
460 }
461 /*
462 [z3:z2:z1:z0] += [m3:m2:m1:m0]
463 */
464 void add_rm(const Reg64& z3, const Reg64& z2, const Reg64& z1, const Reg64& z0,
465 const RegExp& m)
466 {
467 add(z0, ptr [m + 8 * 0]);
468 adc(z1, ptr [m + 8 * 1]);
469 adc(z2, ptr [m + 8 * 2]);
470 adc(z3, ptr [m + 8 * 3]);
471 }
#ifdef DEBUG_COUNT
/*
    Emit code that increments the int counter at *count at run time.
    rax is saved and restored around the update. The counters are plain
    `int` objects, so the increment is a 32-bit (dword) access; a qword
    access would read and write 4 bytes beyond the counter.
    NOTE(review): this is emitted in front of adc/sbb chains that consume
    an incoming carry, so the sequence must leave CF untouched (push, mov,
    inc and pop are expected to) - confirm before changing the instructions.
*/
void upCount(int *count)
{
    push(rax);
    mov(rax, (size_t)count);
    inc(dword [rax]);
    pop(rax);
}
#endif
481 /*
482 [z3:z2:z1:z0] += [m3:m2:m1:m0] with carry
483 */
484 void adc_rm(const Reg64& z3, const Reg64& z2, const Reg64& z1, const Reg64& z0,
485 const RegExp& m)
486 {
487 adc(z0, ptr [m + 8 * 0]);
488 adc(z1, ptr [m + 8 * 1]);
489 adc(z2, ptr [m + 8 * 2]);
490 adc(z3, ptr [m + 8 * 3]);
491 }
492 void load_add_rm(const Reg64& z3, const Reg64& z2, const Reg64& z1, const Reg64& z0,
493 const RegExp& mx, const RegExp& my, bool withCarry)
494 {
495#ifdef DEBUG_COUNT
496 upCount(&g_count_add256);
497#endif
498 if (interleaveLoad) {
499 mov(z0, ptr [mx + 8 * 0]);
500 if (withCarry) {
501 adc(z0, ptr [my + 8 * 0]);
502 } else {
503 add(z0, ptr [my + 8 * 0]);
504 }
505 mov(z1, ptr [mx + 8 * 1]);
506 adc(z1, ptr [my + 8 * 1]);
507 mov(z2, ptr [mx + 8 * 2]);
508 adc(z2, ptr [my + 8 * 2]);
509 mov(z3, ptr [mx + 8 * 3]);
510 adc(z3, ptr [my + 8 * 3]);
511 } else {
512 load_rm(z3, z2, z1, z0, mx);
513 if (withCarry) {
514 adc_rm(z3, z2, z1, z0, my);
515 } else {
516 add_rm(z3, z2, z1, z0, my);
517 }
518 }
519 }
520 void load_sub_rm(const Reg64& z3, const Reg64& z2, const Reg64& z1, const Reg64& z0,
521 const RegExp& mx, const RegExp& my, bool withCarry)
522 {
523#ifdef DEBUG_COUNT
524 upCount(&g_count_add256);
525#endif
526 if (interleaveLoad) {
527 mov(z0, ptr [mx + 8 * 0]);
528 if (withCarry) {
529 sbb(z0, ptr [my + 8 * 0]);
530 } else {
531 sub(z0, ptr [my + 8 * 0]);
532 }
533 mov(z1, ptr [mx + 8 * 1]);
534 sbb(z1, ptr [my + 8 * 1]);
535 mov(z2, ptr [mx + 8 * 2]);
536 sbb(z2, ptr [my + 8 * 2]);
537 mov(z3, ptr [mx + 8 * 3]);
538 sbb(z3, ptr [my + 8 * 3]);
539 } else {
540 load_rm(z3, z2, z1, z0, mx);
541 if (withCarry) {
542 sbb_rm(z3, z2, z1, z0, my);
543 } else {
544 sub_rm(z3, z2, z1, z0, my);
545 }
546 }
547 }
548 /*
549 [z3:z2:z1:z0] -= [x3:x2:x1:x0]
550 */
551 void sub_rr(const Reg64& z3, const Reg64& z2, const Reg64& z1, const Reg64& z0,
552 const Reg64& x3, const Reg64& x2, const Reg64& x1, const Reg64& x0)
553 {
554 sub(z0, x0);
555 sbb(z1, x1);
556 sbb(z2, x2);
557 sbb(z3, x3);
558 }
559 /*
560 [z3:z2:z1:z0] -= [m3:m2:m1:m0]
561 */
562 void sub_rm(const Reg64& z3, const Reg64& z2, const Reg64& z1, const Reg64& z0,
563 const RegExp& m)
564 {
565 sub(z0, ptr [m + 8 * 0]);
566 sbb(z1, ptr [m + 8 * 1]);
567 sbb(z2, ptr [m + 8 * 2]);
568 sbb(z3, ptr [m + 8 * 3]);
569 }
570 /*
571 [z3:z2:z1:z0] -= [m3:m2:m1:m0] with carry
572 */
573 void sbb_rm(const Reg64& z3, const Reg64& z2, const Reg64& z1, const Reg64& z0,
574 const RegExp& m)
575 {
576 sbb(z0, ptr [m + 8 * 0]);
577 sbb(z1, ptr [m + 8 * 1]);
578 sbb(z2, ptr [m + 8 * 2]);
579 sbb(z3, ptr [m + 8 * 3]);
580 }
581 void in_Fp_add_carry(const RegExp& mz, const RegExp& mx, const RegExp& my, bool withCarry)
582 {
583 if (interleaveLoad) {
584 mov(gt1, ptr [mx + 8 * 0]);
585 if (withCarry) {
586 adc(gt1, ptr [my + 8 * 0]);
587 } else {
588 add(gt1, ptr [my + 8 * 0]);
589 }
590 mov(ptr [mz + 8 * 0], gt1);
591
592 mov(gt2, ptr [mx + 8 * 1]);
593 adc(gt2, ptr [my + 8 * 1]);
594 mov(ptr [mz + 8 * 1], gt2);
595
596 mov(gt3, ptr [mx + 8 * 2]);
597 adc(gt3, ptr [my + 8 * 2]);
598 mov(ptr [mz + 8 * 2], gt3);
599
600 mov(gt4, ptr [mx + 8 * 3]);
601 adc(gt4, ptr [my + 8 * 3]);
602 mov(ptr [mz + 8 * 3], gt4);
603 } else {
604 load_add_rm(gt4, gt3, gt2, gt1, mx, my, withCarry);
605 store_mr(mz, gt4, gt3, gt2, gt1);
606 }
607 }
608 void in_Fp_sub_carry(const RegExp& mz, const RegExp& mx, const RegExp& my, bool withCarry)
609 {
610 if (interleaveLoad) {
611 mov(gt1, ptr [mx + 8 * 0]);
612 if (withCarry) {
613 sbb(gt1, ptr [my + 8 * 0]);
614 } else {
615 sub(gt1, ptr [my + 8 * 0]);
616 }
617 mov(ptr [mz + 8 * 0], gt1);
618
619 mov(gt2, ptr [mx + 8 * 1]);
620 sbb(gt2, ptr [my + 8 * 1]);
621 mov(ptr [mz + 8 * 1], gt2);
622
623 mov(gt3, ptr [mx + 8 * 2]);
624 sbb(gt3, ptr [my + 8 * 2]);
625 mov(ptr [mz + 8 * 2], gt3);
626
627 mov(gt4, ptr [mx + 8 * 3]);
628 sbb(gt4, ptr [my + 8 * 3]);
629 mov(ptr [mz + 8 * 3], gt4);
630 } else {
631 load_sub_rm(gt4, gt3, gt2, gt1, mx, my, withCarry);
632 store_mr(mz, gt4, gt3, gt2, gt1);
633 }
634 }
635 void in_Fp_addNC(const RegExp& mz, const RegExp& mx, const RegExp& my)
636 {
637 in_Fp_add_carry(mz, mx, my, false);
638 }
639 void in_Fp_subNC(const RegExp& mz, const RegExp& mx, const RegExp& my)
640 {
641 in_Fp_sub_carry(mz, mx, my, false);
642 }
643 void in_Fp_adcNC(const RegExp& mz, const RegExp& mx, const RegExp& my)
644 {
645 in_Fp_add_carry(mz, mx, my, true);
646 }
647 void in_Fp_sbbNC(const RegExp& mz, const RegExp& mx, const RegExp& my)
648 {
649 in_Fp_sub_carry(mz, mx, my, true);
650 }
651 /*
652 gp1 = mz
653 gp2 = mx
654 gp3 = my
655 */
656 void smart_set_gp(const RegExp& mz, const RegExp& mx, const RegExp& my)
657 {
658 lea(gp1, ptr [mz]);
659 if (mx == mz) {
660 mov(gp2, gp1);
661 } else {
662 lea(gp2, ptr [mx]);
663 }
664 if (my == mz) {
665 mov(gp3, gp1);
666 } else if (my == mx) {
667 mov(gp3, gp2);
668 } else {
669 lea(gp3, ptr [my]);
670 }
671 }
672 void in_FpDbl_addNC(const RegExp& mz, const RegExp& mx, const RegExp& my)
673 {
674 smart_set_gp(mz, mx, my);
675 call(p_FpDbl_addNC);
676 }
677 void in_FpDbl_subNC(const RegExp& mz, const RegExp& mx, const RegExp& my)
678 {
679 smart_set_gp(mz, mx, my);
680 call(p_FpDbl_subNC);
681 }
682 void make_Fp_addNC(int n)
683 {
684#ifdef _WIN32
685 const Reg64& z = rcx;
686 const Reg64& x = rdx;
687 const Reg64& y = r8;
688#else
689 const Reg64& z = rdi;
690 const Reg64& x = rsi;
691 const Reg64& y = rdx;
692#endif
693 const Reg64& z2 = r9;
694 const Reg64& z1 = r10;
695 const Reg64& z0 = r11;
696 for (int i = 0; i < n; i++) {
697 load_add_rm(rax, z2, z1, z0, x + 32 * i, y + 32 * i, false);
698 store_mr(z + 32 * i, rax, z2, z1, z0);
699 }
700 ret();
701 }
702 void make_Fp_subNC()
703 {
704#ifdef _WIN32
705 const Reg64& z = rcx;
706 const Reg64& x = rdx;
707 const Reg64& y = r8;
708#else
709 const Reg64& z = rdi;
710 const Reg64& x = rsi;
711 const Reg64& y = rdx;
712#endif
713 const Reg64& z2 = r9;
714 const Reg64& z1 = r10;
715 const Reg64& z0 = r11;
716 load_sub_rm(x, z2, z1, z0, x, y, false);
717 store_mr(z, x, z2, z1, z0);
718 ret();
719 }
720
721 /*
722 input rax = &s_pTbl[1], gt[4, 3, 2, 1]
723 output gt[4, 3, 2, 1] mod p
724 destroy gt5, gt6, gt7, rdx
725 */
726 void in_Fp_add_modp()
727 {
728 mov(gt5, gt1);
729 mov(gt6, gt2);
730 mov(gt7, gt3);
731 mov(rdx, gt4);
732
733 sub_rm(gt4, gt3, gt2, gt1, rax);
734#if 1 // faster@sandy 1.36Mclk
735 cmovc(gt1, gt5);
736 cmovc(gt2, gt6);
737 cmovc(gt3, gt7);
738 cmovc(gt4, rdx);
739#else // 1.39Mclk
740 jnc("@f");
741 mov(gt1, gt5);
742 mov(gt2, gt6);
743 mov(gt3, gt7);
744 mov(gt4, rdx);
745 L("@@");
746#endif
747 }
748 /*
749 input rax : &s_pTbl[1]
750 destroy gt1, ..., gt7, rdx
751 */
752 void in_Fp_add(const RegExp& mz, const RegExp& mx, const RegExp& my)
753 {
754 load_add_rm(gt4, gt3, gt2, gt1, mx, my, false);
755
756 in_Fp_add_modp();
757 store_mr(mz, gt4, gt3, gt2, gt1);
758 }
759 /*
760 input rax = &s_pTbl[1], gt[4, 3, 2, 1]
761 output gt[4, 3, 2, 1] mod p
762 destroy gt5, gt6, gt7, rdx
763 */
764 void in_Fp_sub_modp()
765 {
766#if 1
767#if 1 // 1.36Mclk
768 sbb(rdx, rdx);
769 mov(gt5, rdx);
770 mov(gt6, rdx);
771 mov(gt7, rdx);
772 and_(rdx, qword [rax + 8 * 0]);
773 and_(gt5, qword [rax + 8 * 1]);
774 and_(gt6, qword [rax + 8 * 2]);
775 and_(gt7, qword [rax + 8 * 3]);
776#else
777 // 1.37Mclk
778 mov(rdx, 0);
779 mov(gt5, rdx);
780 mov(gt6, rdx);
781 mov(gt7, rdx);
782 cmovc(rdx, qword [rax + 8 * 0]);
783 cmovc(gt5, qword [rax + 8 * 1]);
784 cmovc(gt6, qword [rax + 8 * 2]);
785 cmovc(gt7, qword [rax + 8 * 3]);
786#endif
787 add_rr(gt4, gt3, gt2, gt1, gt7, gt6, gt5, rdx);
788#else
789 jnc("@f");
790 add_rm(gt4, gt3, gt2, gt1, rax);
791 L("@@");
792#endif
793 }
794 /*
795 input rax : &s_pTbl[1]
796 destroy gt1, ..., gt7, rdx
797 */
798 void in_Fp_sub(const RegExp& mz, const RegExp& mx, const RegExp& my)
799 {
800 load_sub_rm(gt4, gt3, gt2, gt1, mx, my, false);
801 in_Fp_sub_modp();
802 store_mr(mz, gt4, gt3, gt2, gt1);
803 }
804 /*
805 destroy gt1, ..., gt7, rdx, rax
806 */
807 void in_FpDbl_add(const RegExp& mz, const RegExp& mx, const RegExp& my)
808 {
809 mov(rax, (uint64_t)&s_pTbl[1]);
810 in_Fp_addNC(mz, mx, my);
811 load_add_rm(gt4, gt3, gt2, gt1, mx + sizeof(Fp), my + sizeof(Fp), true);
812 in_Fp_add_modp();
813 store_mr(mz + 32, gt4, gt3, gt2, gt1);
814 }
815 /*
816 destroy gt1, ..., gt7, rdx, rax
817 */
818 void sub_FpDbl_sub(const RegExp& mz, const RegExp& mx, const RegExp& my)
819 {
820 mov(rax, (uint64_t)&s_pTbl[1]);
821 in_Fp_subNC(mz, mx, my);
822 load_sub_rm(gt4, gt3, gt2, gt1, mx + sizeof(Fp), my + sizeof(Fp), true);
823 in_Fp_sub_modp();
824 store_mr(mz + 32, gt4, gt3, gt2, gt1);
825 }
826 void in_FpDbl_sub(const RegExp& mz, const RegExp& mx, const RegExp& my)
827 {
828 smart_set_gp(mz, mx, my);
829 call(p_FpDbl_sub);
830 }
831 void set_p_FpDbl_add()
832 {
833 align(16);
834 p_FpDbl_add = (void*)const_cast<uint8_t*>(getCurr());
835 in_FpDbl_add(gp1, gp2, gp3);
836 ret();
837 }
838 void set_p_FpDbl_addNC()
839 {
840 align(16);
841 p_FpDbl_addNC = (void*)const_cast<uint8_t*>(getCurr());
842 in_Fp_addNC(gp1, gp2, gp3);
843 in_Fp_adcNC(gp1 + 32, gp2 + 32, gp3 + 32);
844 ret();
845 }
846 void set_p_FpDbl_subNC()
847 {
848 align(16);
849 p_FpDbl_subNC = (void*)const_cast<uint8_t*>(getCurr());
850 in_Fp_subNC(gp1, gp2, gp3);
851 in_Fp_sbbNC(gp1 + 32, gp2 + 32, gp3 + 32);
852 ret();
853 }
854 void in_Fp2Dbl_add(const RegExp& mz, const RegExp& mx, const RegExp& my)
855 {
856 smart_set_gp(mz, mx, my);
857 call(p_FpDbl_add);
858 add(gp1, 64);
859 add(gp2, 64);
860 add(gp3, 64);
861 call(p_FpDbl_add);
862 }
863 void set_p_FpDbl_sub()
864 {
865 align(16);
866 p_FpDbl_sub = (void*)const_cast<uint8_t*>(getCurr());
867 sub_FpDbl_sub(gp1, gp2, gp3);
868 ret();
869 }
870 void in_Fp2Dbl_sub(const RegExp& mz, const RegExp& mx, const RegExp& my)
871 {
872 smart_set_gp(mz, mx, my);
873 call(p_FpDbl_sub);
874 add(gp1, 64);
875 add(gp2, 64);
876 add(gp3, 64);
877 call(p_FpDbl_sub);
878 }
879 void in_Fp_add(int n, const RegExp& mz, const RegExp& mx, const RegExp& my)
880 {
881 mov(rax, (uint64_t)&s_pTbl[1]);
882 for (int i = 0; i < n; i++) {
883 in_Fp_add(mz + 32 * i, mx + 32 * i, my + 32 * i);
884 }
885 }
886
887 void in_Fp_neg(const RegExp& mz, const RegExp& mx)
888 {
889 load_rm(gt4, gt3, gt2, gt1, mx);
890 mov(rdx, gt1);
891 or_(rdx, gt2);
892 or_(rdx, gt3);
893 or_(rdx, gt4);
894 jz("@f");
895 load_sub_rm(gt4, gt3, gt2, gt1, rax, mx, false);
896L("@@");
897 store_mr(mz, gt4, gt3, gt2, gt1);
898 }
899 void in_Fp_neg(int n, const RegExp& mz, const RegExp& mx)
900 {
901 mov(rax, (uint64_t)&s_pTbl[1]);
902 for (int i = 0; i < n; i++) {
903 in_Fp_neg(mz + 32 * i, mx + 32 * i);
904 }
905 }
906 void in_Fp2_neg(const RegExp& mz, const RegExp& mx)
907 {
908 // smart_set_gp for only two arguments.
909 lea(gp1, ptr [mz]);
910 if (mx == mz) {
911 mov(gp2, gp1);
912 } else {
913 lea(gp2, ptr [mx]);
914 }
915
916 call(p_Fp2_neg);
917 }
918 void set_p_Fp2_neg()
919 {
920 align(16);
921 p_Fp2_neg = (void*)const_cast<uint8_t*>(getCurr());
922 in_Fp_neg(2, gp1, gp2);
923 ret();
924 }
925
926 void in_Fp2_add(const RegExp& mz, const RegExp& mx, const RegExp& my)
927 {
928 smart_set_gp(mz, mx, my);
929 call(p_Fp2_add);
930 }
931 void in_Fp2_sub(const RegExp& mz, const RegExp& mx, const RegExp& my)
932 {
933 smart_set_gp(mz, mx, my);
934 call(p_Fp2_sub);
935 }
936 void set_p_Fp2_addNC()
937 {
938 align(16);
939 p_Fp2_addNC = (void*)const_cast<uint8_t*>(getCurr());
940 in_Fp_addNC(gp1, gp2, gp3);
941 in_Fp_addNC(gp1 + 32, gp2 + 32, gp3 + 32);
942 ret();
943 }
944 void set_p_Fp2_add()
945 {
946 align(16);
947 p_Fp2_add = (void*)const_cast<uint8_t*>(getCurr());
948 in_Fp_add(2, gp1, gp2, gp3);
949 ret();
950 }
951 void set_p_Fp2_sub()
952 {
953 align(16);
954 p_Fp2_sub = (void*)const_cast<uint8_t*>(getCurr());
955 in_Fp_sub(2, gp1, gp2, gp3);
956 ret();
957 }
958 void in_Fp2_addNC(const RegExp& mz, const RegExp& mx, const RegExp& my)
959 {
960 smart_set_gp(mz, mx, my);
961 call(p_Fp2_addNC);
962 }
963 void in_Fp_sub(int n, const RegExp& mz, const RegExp& mx, const RegExp& my)
964 {
965 mov(rax, (uint64_t)&s_pTbl[1]);
966 for (int i = 0; i < n; i++) {
967 in_Fp_sub(mz + 32 * i, mx + 32 * i, my + 32 * i);
968 }
969 }
970 void in_FpDbl_add(int n, const RegExp& mz, const RegExp& mx, const RegExp& my)
971 {
972 for (int i = 0; i < n; i++) {
973 in_FpDbl_add(mz + 64 * i, mx + 64 * i, my + 64 * i);
974 }
975 }
976 void in_FpDbl_addNC(int n, const RegExp& mz, const RegExp& mx, const RegExp& my)
977 {
978 for (int i = 0; i < n; i++) {
979 in_FpDbl_addNC(mz + 64 * i, mx + 64 * i, my + 64 * i);
980 }
981 }
982 void in_FpDbl_sub(int n, const RegExp& mz, const RegExp& mx, const RegExp& my)
983 {
984 for (int i = 0; i < n; i++) {
985 sub_FpDbl_sub(mz + 64 * i, mx + 64 * i, my + 64 * i);
986 }
987 }
988
989 /*
990 add(uint64_t z[4], const uint64_t x[4], const uint64_t y[4]);
991 z[3..0] = (y[3..0] + x[3..0]) % p
992 @note accept z == y or z == x
993 */
994 void make_Fp_add(int n)
995 {
996 MakeStackFrame<> sf(this, 7);
997 in_Fp_add(n, gp1, gp2, gp3);
998 }
999 void make_Fp_sub(int n)
1000 {
1001 MakeStackFrame<> sf(this, 7);
1002 in_Fp_sub(n, gp1, gp2, gp3);
1003 }
1004 void set_p_Fp6_add()
1005 {
1006 align(16);
1007 p_Fp6_add = (void*)const_cast<uint8_t*>(getCurr());
1008 in_Fp_add(6, gp1, gp2, gp3);
1009 ret();
1010 }
1011 void make_Fp6_add()
1012 {
1013 MakeStackFrame<> sf(this, 7);
1014 call(p_Fp6_add);
1015 }
1016 void set_p_Fp6_sub()
1017 {
1018 align(16);
1019 p_Fp6_sub = (void*)const_cast<uint8_t*>(getCurr());
1020 in_Fp_sub(6, gp1, gp2, gp3);
1021 ret();
1022 }
1023 void make_Fp6_sub()
1024 {
1025 MakeStackFrame<> sf(this, 7);
1026 call(p_Fp6_sub);
1027 }
1028
1029 void make_Fp_neg()
1030 {
1031#ifdef _WIN32
1032 const Reg64& z = rcx;
1033 const Reg64& x = rdx;
1034#else
1035 const Reg64& z = rdi;
1036 const Reg64& x = rsi;
1037#endif
1038 const Reg64& z3 = r8;
1039 const Reg64& z2 = r9;
1040 const Reg64& z1 = r10;
1041 const Reg64& z0 = r11;
1042
1043 load_rm(z3, z2, z1, z0, x);
1044 mov(rax, z0);
1045 or_(rax, z1);
1046 or_(rax, z2);
1047 or_(rax, z3);
1048 jz("@f");
1049 mov(rax, (uint64_t)&s_pTbl[1]);
1050 load_sub_rm(z3, z2, z1, z0, rax, x, false);
1051 L("@@");
1052 store_mr(z, z3, z2, z1, z0);
1053 ret();
1054 }
1055 /*
1056 [x3:x2:x1:x0] >>= n
1057 */
1058 void shrn(const Reg64& x3, const Reg64& x2, const Reg64& x1, const Reg64& x0, uint8 n)
1059 {
1060 shrd(x0, x1, n); // x0 = [x1:x0] >> n
1061 shrd(x1, x2, n); // x1 = [x2:x1] >> n
1062 shrd(x2, x3, n); // x2 = [x3:x2] >> n
1063 shr(x3, n); // x3 >> n
1064 }
1065 /*
1066 [x3:x2:x1:x0] >>= 1
1067 */
1068 void shr1(const Reg64& x3, const Reg64& x2, const Reg64& x1, const Reg64& x0)
1069 {
1070 shrn(x3, x2, x1, x0, 1);
1071 }
1072 /*
1073 [x3:x2:x1:x0] <<= 1
1074 */
1075 void shl1(const Reg64& x3, const Reg64& x2, const Reg64& x1, const Reg64& x0)
1076 {
1077 add_rr(x3, x2, x1, x0, x3, x2, x1, x0);
1078 }
1079 /*
1080 input : gp1, gp2, rax = s_pTbl[1]
1081 destroy : rdx, gt1, ..., gt7
1082 */
1083 void sub_Fp_divBy2(const RegExp& z, const RegExp& x)
1084 {
1085 mov(rdx, ptr [x]);
1086 and_(rdx, 1); // x[0] & 1
1087 shl(rdx, 5); // * 32
1088 load_rm(gt4, gt3, gt2, gt1, x);
1089 shr1(gt4, gt3, gt2, gt1);
1090 mov(gt5, (uint64_t)Fp::halfTbl_);
1091 add(rdx, gt5);
1092 add_rm(gt4, gt3, gt2, gt1, rdx);
1093 store_mr(z, gt4, gt3, gt2, gt1);
1094 }
1095 void set_p_Fp2_divBy2()
1096 {
1097 align(16);
1098 p_Fp2_divBy2 = (void*)const_cast<uint8_t*>(getCurr());
1099 const Reg64& z = gp1;
1100 const Reg64& x = gp2;
1101 mov(rax, (uint64_t)&s_pTbl[1]);
1102 sub_Fp_divBy2(z, x);
1103 sub_Fp_divBy2(z + 32, x + 32);
1104 ret();
1105 }
1106 void make_Fp2_divBy2()
1107 {
1108 MakeStackFrame<> sf(this, 7);
1109 call(p_Fp2_divBy2);
1110 }
1111 /*
1112 x[3:2:1:0] <<= 1
1113 */
1114 void shl1(const Reg64& x, const Reg64& t)
1115 {
1116 mov(t, ptr [x + 8 * 0]);
1117 add(ptr [x + 8 * 0], t);
1118 mov(t, ptr [x + 8 * 1]);
1119 adc(ptr [x + 8 * 1], t);
1120 mov(t, ptr [x + 8 * 2]);
1121 adc(ptr [x + 8 * 2], t);
1122 mov(t, ptr [x + 8 * 3]);
1123 adc(ptr [x + 8 * 3], t);
1124 }
1125 void make_Fp_shr(uint8 n = 1)
1126 {
1127#ifdef _WIN32
1128 const Reg64& z = rcx;
1129 const Reg64& x = rdx;
1130#else
1131 const Reg64& z = rdi;
1132 const Reg64& x = rsi;
1133#endif
1134 const Reg64& z3 = r8;
1135 const Reg64& z2 = r9;
1136 const Reg64& z1 = r10;
1137 const Reg64& z0 = r11;
1138 load_rm(z3, z2, z1, z0, x);
1139 shrn(z3, z2, z1, z0, n);
1140 store_mr(z, z3, z2, z1, z0);
1141 ret();
1142 }
1143 /*
1144 [d:x:t2:t1:t0] <- py[3:2:1:0] * x
1145 destroy x, t
1146 */
1147 void mul4x1(const RegExp& py, const Reg64& x, const Reg64& t3, const Reg64& t2, const Reg64& t1, const Reg64& t0,
1148 const Reg64& t)
1149 {
1150 const Reg64& a = rax;
1151 const Reg64& d = rdx;
1152 if (g_useMulx) {
1153 mov(d, x);
1154 mulx(t1, t0, ptr [py + 8 * 0]);
1155 mulx(t2, a, ptr [py + 8 * 1]);
1156 add(t1, a);
1157 mulx(x, a, ptr [py + 8 * 2]);
1158 adc(t2, a);
1159 mulx(d, a, ptr [py + 8 * 3]);
1160 adc(x, a);
1161 adc(d, 0);
1162 return;
1163 }
1164 mov(a, ptr [py]);
1165 mul(x);
1166 mov(t0, a);
1167 mov(t1, d);
1168 mov(a, ptr [py + 8]);
1169 mul(x);
1170 mov(t, a);
1171 mov(t2, d);
1172 mov(a, ptr [py + 8 * 2]);
1173 mul(x);
1174 mov(t3, a);
1175 mov(a, x);
1176 mov(x, d);
1177 mul(qword [py + 8 * 3]);
1178 add(t1, t);
1179 adc(t2, t3);
1180 adc(x, a);
1181 adc(d, 0);
1182 }
1183
1184 /*
1185 c = [c4:c3:c2:c1:c0]
1186 c += x[3..0] * y
1187 q = uint64_t(c0 * pp)
1188 c = (c + q * p) >> 64
1189 input [c4:c3:c2:c1:c0], px, y, p
1190 output [c0:c4:c3:c2:c1]
1191
1192 @note use rax, rdx, destroy y
1193 @note max([c4:c3:c2:c1:c0]) = 2p - 1, ie. c4 = 0 or 1
1194 */
1195 void montgomery1(const Reg64& c4, const Reg64& c3, const Reg64& c2, const Reg64& c1, const Reg64& c0,
1196 const Reg64& px, const Reg64& y, const Reg64& p,
1197 const Reg64& t0, const Reg64& t1, const Reg64& t2, const Reg64& t3, const Reg64& t4, bool isFirst)
1198 {
1199 const Reg64& a = rax;
1200 const Reg64& d = rdx;
1201 if (isFirst) {
1202 mul4x1(px, y, c3, c2, c1, c0, c4);
1203 mov(c4, d);
1204 // [c4:y:c2:c1:c0] = px[3..0] * y
1205 } else {
1206 mul4x1(px, y, t3, t2, t1, t0, t4);
1207 // [d:y:t2:t1:t0] = px[3..0] * y
1208 add_rr(y, c2, c1, c0, c3, t2, t1, t0);
1209 adc(c4, d);
1210 }
1211 mov(rax, pp_);
1212 mul(c0); // q = a
1213 mov(c3, a);
1214 mul4x1(p, c3, t3, t2, t1, t0, t4);
1215 add(c0, t0);
1216// mov(c0, 0); // c0 is always zero because Montgomery reduction
1217 adc(c1, t1);
1218 adc(c2, t2);
1219 adc(c3, y);
1220 adc(c4, d);
1221 adc(c0, 0);
1222 }
1223 /*
1224 input (z, x, y) = (gp1, gp2, gp3)
1225 z[0..3] <- montgomery(x[0..3], y[0..3])
1226 destroy gt1, ..., gt10, xm0, xm1, gp3
1227 */
1228 void Fp_mul()
1229 {
1230#ifdef DEBUG_COUNT
1231 upCount(&g_count_m256);
1232 upCount(&g_count_r512);
1233#endif
1234 movq(xm0, gp1); // save gp1
1235 mov(gp1, (uint64_t)&s_pTbl[1]);
1236 movq(xm1, gp3);
1237 mov(gp3, ptr [gp3]);
1238 montgomery1(gt1, gt8, gt4, gt3, gt2, gp2, gp3, gp1, gt5, gt6, gt7, gt9, gt10, true);
1239
1240 movq(gp3, xm1);
1241 mov(gp3, ptr [gp3 + 8]);
1242 montgomery1(gt2, gt1, gt8, gt4, gt3, gp2, gp3, gp1, gt5, gt6, gt7, gt9, gt10, false);
1243
1244 movq(gp3, xm1);
1245 mov(gp3, ptr [gp3 + 16]);
1246 montgomery1(gt3, gt2, gt1, gt8, gt4, gp2, gp3, gp1, gt5, gt6, gt7, gt9, gt10, false);
1247
1248 movq(gp3, xm1);
1249 mov(gp3, ptr [gp3 + 24]);
1250 montgomery1(gt4, gt3, gt2, gt1, gt8, gp2, gp3, gp1, gt5, gt6, gt7, gt9, gt10, false);
1251 // [gt8:gt4:gt3:gt2:gt1]
1252
1253 mov(gt5, gt1);
1254 mov(gt6, gt2);
1255 mov(gt7, gt3);
1256 mov(rdx, gt4);
1257 sub_rm(gt4, gt3, gt2, gt1, gp1);
1258 cmovc(gt1, gt5);
1259 cmovc(gt2, gt6);
1260 cmovc(gt3, gt7);
1261 cmovc(gt4, rdx);
1262
1263 movq(gp1, xm0); // load gp1
1264 store_mr(gp1, gt4, gt3, gt2, gt1);
1265 }
1266 void set_p_Fp_mul()
1267 {
1268 align(16);
1269 // (gp1, gp2, gp3), destory gp3
1270 p_Fp_mul = (void*)const_cast<uint8_t*>(getCurr());
1271 Fp_mul();
1272 ret();
1273 }
1274 void set_p_FpDbl_mod()
1275 {
1276 align(16);
1277 p_FpDbl_mod = (void*)const_cast<uint8_t*>(getCurr());
1278 mont_mod();
1279 ret();
1280 }
1281 // call:32
1282 void make_Fp_mul()
1283 {
1284 MakeStackFrame<> sf(this, 10);
1285 call(p_Fp_mul);
1286 }
1287 // call:32
1288 void make_Fp2_mul_Fp_0()
1289 {
1290 MakeStackFrame<> sf(this, 10);
1291 movq(xm2, gp3);
1292 call(p_Fp_mul); // mul(z.a_, x.a_, b);
1293 movq(gp3, xm2);
1294 add(gp1, sizeof(Fp));
1295 add(gp2, sizeof(Fp));
1296 call(p_Fp_mul); // mul(z.b_, x.b_, b);
1297 }
1298
1299 /*
1300 pz[7..0] <- px[3..0] * py[3..0]
1301 */
// Emits a 4-limb x 4-limb (64-bit limbs) multiply, one row px[i] * py[3..0]
// at a time. Each row is added to the running sum and the lowest finished
// limb is written to pz. t0..t9 are scratch registers chosen by the caller;
// rax and rdx are used for the multiply operands/results.
1302 void mul4x4(const RegExp& pz, const RegExp& px, const RegExp& py,
1303 const Reg64& t9, const Reg64& t8, const Reg64& t7, const Reg64& t6, const Reg64& t5, const Reg64& t4, const Reg64& t3, const Reg64& t2, const Reg64& t1, const Reg64& t0)
1304 {
1305#ifdef DEBUG_COUNT
1306 upCount(&g_count_m256);
1307#endif
1308 const Reg64& a = rax;
1309 const Reg64& d = rdx;
1310
// Row 0: px[0] * py[3..0]; the lowest limb goes straight to pz[0].
1311 if (g_useMulx) {
// mulx reads its implicit factor from rdx, so px[0] is loaded there.
1312 mov(d, ptr [px]);
1313 mulx(t0, a, ptr [py + 8 * 0]);
1314 mov(ptr [pz + 8 * 0], a);
1315 mulx(t1, a, ptr [py + 8 * 1]);
1316 add(t0, a);
1317 mulx(t2, a, ptr [py + 8 * 2]);
1318 adc(t1, a);
1319 mulx(t3, a, ptr [py + 8 * 3]);
1320 adc(t2, a);
1321 adc(t3, 0);
1322 } else {
// mul path: px[0] is held in t5; low halves are parked in t3/t4 until the
// carry chain below combines them with the high halves.
1323 mov(t5, ptr [px]);
1324 mov(a, ptr [py + 8 * 0]);
1325 mul(t5);
1326 mov(ptr [pz + 8 * 0], a);
1327 mov(t0, d);
1328 mov(a, ptr [py + 8 * 1]);
1329 mul(t5);
1330 mov(t3, a);
1331 mov(t1, d);
1332 mov(a, ptr [py + 8 * 2]);
1333 mul(t5);
1334 mov(t4, a);
1335 mov(t2, d);
1336 mov(a, ptr [py + 8 * 3]);
1337 mul(t5);
1338 add(t0, t3);
// mov does not modify flags, so the carry from the add above survives.
1339 mov(t3, 0);
1340 adc(t1, t4);
1341 adc(t2, a);
1342 adc(t3, d); // [t3:t2:t1:t0:pz[0]] = px[0] * py[3..0]
1343 }
1344
1345 // here [t3:t2:t1:t0]
1346
// Row 1: add px[1] * py[3..0]; t0 becomes pz[1].
1347 mov(t9, ptr [px + 8]);
1348
1349 // [d:t9:t7:t6:t5] = px[1] * py[3..0]
1350 mul4x1(py, t9, t8, t7, t6, t5, t4);
1351 add_rr(t3, t2, t1, t0, t9, t7, t6, t5);
1352 adc(d, 0);
1353 mov(t8, d);
1354 mov(ptr [pz + 8], t0);
1355 // here [t8:t3:t2:t1]
1356
// Row 2: add px[2] * py[3..0]; t1 becomes pz[2].
1357 mov(t9, ptr [px + 16]);
1358
1359 // [d:t9:t6:t5:t4]
1360 mul4x1(py, t9, t7, t6, t5, t4, t0);
1361 add_rr(t8, t3, t2, t1, t9, t6, t5, t4);
1362 adc(d, 0);
1363 mov(t7, d);
1364 mov(ptr [pz + 16], t1);
1365
// Row 3: add px[3] * py[3..0]; the remaining limbs are stored at pz + 8*3
// and the final carry word (rdx) at pz + 8*7.
1366 mov(t9, ptr [px + 24]);
1367
1368 // [d:t9:t5:t4:t1]
1369 mul4x1(py, t9, t6, t5, t4, t1, t0);
1370 add_rr(t7, t8, t3, t2, t9, t5, t4, t1);
1371 adc(d, 0);
1372 store_mr(pz + 8 * 3, t7, t8, t3, t2);
1373 mov(ptr [pz + 8 * 7], d);
1374 }
1375
1376 /*
1377 @input (z, x) = (gp1, gp2)
1378 z[3..0] = Montgomery reduction(x[7..0])
1379 @note destroy rax, rdx, gt1, ..., gt10, gp3, xm0, xm1
1380 */
// Four rounds follow. In each round q = low 64 bits of (pp_ * lowest live
// limb), then q * p is added so that the lowest limb drops out of the
// running value (see the c' comments). A final conditional subtraction of p
// is done before the result is stored to z.
1381 void mont_mod()
1382 {
1383#ifdef DEBUG_COUNT
1384 upCount(&g_count_r512);
1385#endif
1386 const Reg64& a = rax;
1387 const Reg64& d = rdx;
1388
// gp1 (z) is needed as a work register; keep the pointer in xm0.
1389 movq(xm0, gp1);
1390 mov(gp1, ptr [gp2 + 8 * 0]);
1391
1392 mov(a, pp_);
1393 mul(gp1);
// gp3 holds the address of s_pTbl[1] (used as p) for the rest of the function.
1394 mov(gp3, (uint64_t)&s_pTbl[1]);
1395 mov(gt7, a); // q
1396
1397 // [d:gt7:gt3:gt2:gt1] = p * q
1398 mul4x1(gp3, gt7, gt4, gt3, gt2, gt1, gt8);
1399
// Add x[7..0] to p * q; the carry chain runs through all eight limbs.
1400 add(gt1, gp1);
1401 adc(gt2, qword [gp2 + 8 * 1]);
1402 adc(gt3, qword [gp2 + 8 * 2]);
1403 adc(gt7, qword [gp2 + 8 * 3]);
1404 mov(gt4, ptr [gp2 + 8 * 4]);
1405 adc(gt4, d);
1406 mov(gt8, ptr [gp2 + 8 * 5]);
1407 adc(gt8, 0);
1408 mov(gt9, ptr [gp2 + 8 * 6]);
1409 adc(gt9, 0);
1410 mov(gt10, ptr [gp2 + 8 * 7]);
1411 adc(gt10, 0); // c' = [gt10:gt9:gt8:gt4:gt7:gt3:gt2]
1412
1413 // free gp1, gt1, gt5, gp2, gt6
1414
// Round 2, driven by gt2.
1415 mov(a, pp_);
1416 mul(gt2);
1417 mov(gp1, a); // q
1418
// gt10 is handed to mul4x1 below, so its value is parked in xm1 and
// reloaded right after the call.
1419 movq(xm1, gt10);
1420 // [d:gp1:gt5:gp2:gt6] = p * q
1421 mul4x1(gp3, gp1, gt1, gt5, gp2, gt6, gt10);
1422 movq(gt10, xm1);
1423
1424 add_rr(gt4, gt7, gt3, gt2, gp1, gt5, gp2, gt6);
1425 adc(gt8, d);
1426 adc(gt9, 0);
1427 adc(gt10, 0); // c' = [gt10:gt9:gt8:gt4:gt7:gt3]
1428
1429 // free gp1, gt1, gt2, gt5, gp2, gt6
1430
// Round 3, driven by gt3.
1431 mov(a, pp_);
1432 mul(gt3);
1433 mov(gp1, a); // q
1434
1435 // [d:gp1:gt5:gp2:gt6] = p * q
1436 mul4x1(gp3, gp1, gt1, gt5, gp2, gt6, gt2);
1437
1438 add_rr(gt8, gt4, gt7, gt3, gp1, gt5, gp2, gt6);
1439 adc(gt9, d);
1440 adc(gt10, 0); // c' = [gt10:gt9:gt8:gt4:gt7]
1441
1442 // free gp1, gt1, gt2, gt7, gt5, gp2, gt6
1443
// Round 4, driven by gt7.
1444 mov(a, pp_);
1445 mul(gt7);
1446 mov(gp1, a); // q
1447
1448 // [d:gp1:gt5:gp2:gt6] = p * q
1449 mul4x1(gp3, gp1, gt1, gt5, gp2, gt6, gt2);
1450
1451 add_rr(gt9, gt8, gt4, gt7, gp1, gt5, gp2, gt6);
1452 adc(gt10, d); // c' = [gt10:gt9:gt8:gt4]
1453
// Conditional subtract: keep a copy of c', subtract p, and restore the copy
// if the subtraction borrowed (carry set).
1454 mov(gp1, gt4);
1455 mov(gt1, gt8);
1456 mov(gt2, gt9);
1457 mov(gt3, gt10);
1458 sub_rm(gt10, gt9, gt8, gt4, gp3);
1459 cmovc(gt4, gp1);
1460 cmovc(gt8, gt1);
1461 cmovc(gt9, gt2);
1462 cmovc(gt10, gt3);
1463
// Reload the destination pointer saved at entry and store the result.
1464 movq(gp1, xm0);
1465 store_mr(gp1, gt10, gt9, gt8, gt4);
1466 }
1467#ifdef BN_SUPPORT_SNARK
1468 /*
1469 [mz] = ([mx] * 9 + [my]) mod p if doAdd is true
1470 [mz] = ([mx] * 9 + p - [my]) mod p if doAdd is false
1471 */
// The 5-limb value x = [mx] * 9 +/- [my] is reduced by estimating a multiple
// f(x) of p from the top bits of x (multiply by the precomputed reciprocal
// pRev), subtracting the table entry s_pTbl[f(x)], and finishing with
// in_Fp_add_modp().
// NOTE(review): `d` is not declared inside this function (mul4x4 and mont_mod
// bind it to rdx locally); confirm it resolves to rdx in the enclosing scope.
1472 void in_Fp_mul_xi_addsub(const RegExp& mz, const RegExp& mx, const RegExp& my, bool doAdd)
1473 {
1474 /*
1475 require p * 10 < (1<<258) because sizeof(s_pTbl[0]) == 32
1476 x *= 9
1477 pTop = (p >> 193) + 1 # 0x183227397098d015
1478 pRev = (1<<124) / pTop # 0xa948e8c4c474094e
1479 def f(x):
1480 return ((x>>193) * pRev) >> 124
1481 */
1482 const uint64_t pRev = uint64_t(0xa948e8c4c474094eLL);
1483 mov(gt4, 9);
1484 // [d:gt4:gt3:gt2:gt1] = [mx] * 9
1485 mul4x1(mx, gt4, gt5, gt3, gt2, gt1, gt6);
1486 if (doAdd) {
1487 add_rm(gt4, gt3, gt2, gt1, my);
1488 adc(d, 0);
1489 } else {
1490 mov(rax, (uint64_t)&s_pTbl[1]);
1491 add_rm(gt4, gt3, gt2, gt1, rax);
1492 adc(d, 0); // [mx] * 9 + p
1493 sub_rm(gt4, gt3, gt2, gt1, my);
1494 sbb(d, 0); // [mx] * 9 + p - [my]
1495 }
1496 // d = [d:gt4] >> 1 = x >> 193
1497 shld(d, gt4, 63);
1498 mov(rax, pRev);
// mul leaves the high 64 bits of rax * d in rdx; shifting that right by 60
// gives (d * pRev) >> 124.
1499 mul(d);
1500 shr(d, 60); // f(x)
// Scale the index by 32 (the table entry size noted above).
1501 shl(d, 5);
1502 mov(rax, (uint64_t)&s_pTbl[1]);
1503 // use only 256bit value(d is not necessary)
// rax - 32 + rdx is the address of s_pTbl[f(x)].
1504 sub_rm(gt4, gt3, gt2, gt1, rax - 32 + rdx); // 0 <= [gt4:gt3:gt2:gt1] < 2p
1505 in_Fp_add_modp();
1506 store_mr(mz, gt4, gt3, gt2, gt1);
1507 }
1508#endif
1509
// Fp2 multiplication by xi: mz (two Fp values at mz and mz + 32) is computed
// from mx (two Fp values at mx and mx + 32).
//   BN_SUPPORT_SNARK : mz.a = 9 * mx.a - mx.b, mz.b = 9 * mx.b + mx.a
//   otherwise        : mz.a = mx.a - mx.b,     mz.b = mx.a + mx.b
// rax is loaded with the address of s_pTbl[1] before the helpers run.
1510 void in_Fp2_mul_xi(const RegExp& mz, const RegExp& mx)
1511 {
1512 mov(rax, (uint64_t)&s_pTbl[1]);
1513#ifdef BN_SUPPORT_SNARK
1514#if 1
1515 // 133clk -> 66clk
1516 in_Fp_mul_xi_addsub(mz, mx, mx + 32, false);
1517 in_Fp_mul_xi_addsub(mz + 32, mx + 32, mx, true);
1518#else
// Disabled reference version: builds 9 * x by repeated doubling.
1519 in_Fp_add(mz, mx, mx); // 2
1520 in_Fp_add(mz, mz, mz); // 4
1521 in_Fp_add(mz, mz, mz); // 8
1522 in_Fp_add(mz, mz, mx); // 9
1523 in_Fp_sub(mz, mz, mx + 32);
1524
1525 in_Fp_add(mz + 32, mx + 32, mx + 32); // 2
1526 in_Fp_add(mz + 32, mz + 32, mz + 32); // 4
1527 in_Fp_add(mz + 32, mz + 32, mz + 32); // 8
1528 in_Fp_add(mz + 32, mz + 32, mx + 32); // 9
1529 in_Fp_add(mz + 32, mz + 32, mx);
1530#endif
1531#else
1532 in_Fp_sub(mz, mx, mx + 32);
1533 in_Fp_add(mz + 32, mx, mx + 32);
1534#endif
1535 }
1536 void make_Fp2_mul_xi()
1537 {
1538 MakeStackFrame<> sf(this, 7);
1539 in_Fp2_mul_xi(gp1, gp2);
1540 }
1541
1542 /*
1543 destroy : gt1, gt2, gt3, gt4, rdx(, rax)
1544 memo : gp3 is free
1545 */
1546 void in_FpDbl_neg(const RegExp& mz, const RegExp& mx)
1547 {
1548 inLocalLabel();
1549 load_rm(gt4, gt3, gt2, gt1, mx);
1550 mov(rdx, gt4);
1551 or_(rdx, gt3);
1552 or_(rdx, gt2);
1553 or_(rdx, gp1);
1554 load_rm(gt4, gt3, gt2, gt1, mx + 32);
1555 or_(rdx, gt4);
1556 or_(rdx, gt3);
1557 or_(rdx, gt2);
1558 or_(rdx, gp1);
1559#ifdef DEBUG_COUNT
1560 jnz(".neg", T_NEAR);
1561#else
1562 jnz(".neg");
1563#endif
1564 // all zero
1565 store_mr(mz, rdx, rdx, rdx, rdx);
1566 store_mr(mz + 32, rdx, rdx, rdx, rdx);
1567#ifdef DEBUG_COUNT
1568 jmp(".exit", T_NEAR);
1569#else
1570 jmp(".exit");
1571#endif
1572 L(".neg");
1573 mov(rax, (uint64)&s_pTbl[0]); // rax refers to pN, lower 256-bits are zero.
1574 in_Fp_subNC(mz, rax, mx);
1575 in_Fp_sbbNC(mz + 32, rax + 32, mx + 32);
1576 L(".exit");
1577 outLocalLabel();
1578 }
1579 void make_FpDbl_neg()
1580 {
1581 MakeStackFrame<> sf(this, 4);
1582 in_FpDbl_neg(gp1, gp2);
1583 }
1584 void make_Fp2Dbl_neg()
1585 {
1586 MakeStackFrame<> sf(this, 4);
1587 in_FpDbl_neg(gp1, gp2);
1588 in_FpDbl_neg(gp1 + 64, gp2 + 64);
1589 }
1590
1591 /*
1592 pz[7..0] <- (px[7..0] + py[7..0]) mod pN
1593 */
1594 void make_FpDbl_add(int n)
1595 {
1596 MakeStackFrame<> sf(this, 7);
1597 in_FpDbl_add(n, gp1, gp2, gp3);
1598 }
1599 /*
1600 pz[7..0] <- (px[7..0] - py[7..0]) mod pN
1601 */
1602 void make_FpDbl_sub(int n)
1603 {
1604 MakeStackFrame<> sf(this, 7);
1605 in_FpDbl_sub(n, gp1, gp2, gp3);
1606 }
1607
1608 /*
1609 z[7..0] <- x[7..0] + y[7..0],
1610 */
1611 void make_FpDbl_addNC(int n)
1612 {
1613 MakeStackFrame<> sf(this, 7);
1614 in_FpDbl_addNC(n, gp1, gp2, gp3);
1615 }
1616 /*
1617 z[7..0] <- x[7..0] - y[7..0],
1618 */
1619 void make_FpDbl_subNC(int n)
1620 {
1621 MakeStackFrame<> sf(this, 7);
1622 for (int i = 0; i < n; i++) {
1623 in_Fp_subNC(gp1 + 64 * i, gp2 + 64 * i, gp3 + 64 * i);
1624 in_Fp_sbbNC(gp1 + 64 * i + 32, gp2 + 64 * i + 32, gp3 + 64 * i + 32);
1625 }
1626 }
// Inline call site for the shared routine at p_Fp2Dbl_mul_xi: rax is loaded
// with the address of s_pTbl[1], then gp1 <- &mz and gp2 <- &mx.
// gp1 is written before mx is evaluated, so mx must not be addressed through gp1.
1627 void in_Fp2Dbl_mul_xi(const RegExp& mz, const RegExp& mx)
1628 {
1629 mov(rax, (uint64_t)&s_pTbl[1]);
1630 lea(gp1, ptr [mz]);
1631 lea(gp2, ptr [mx]);
1632 call(p_Fp2Dbl_mul_xi);
1633 }
1634 void make_Fp2Dbl_mul_xi()
1635 {
1636 MakeStackFrame<> sf(this, 7);
1637 call(p_Fp2Dbl_mul_xi);
1638 }
1639
1640 /*
1641 pz[7..0] <- px[3..0] * py[3..0]
1642 */
1643 void make_FpDbl_mul()
1644 {
1645 MakeStackFrame<> sf(this, 10);
1646 mul4x4(gp1, gp2, gp3, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1, gt10);
1647 }
1648
1649 /*
1650 pz[3..0] <- mont_mod(px[7..0])
1651 */
1652 void make_FpDbl_mod()
1653 {
1654 MakeStackFrame<> sf(this, 10);
1655 call(p_FpDbl_mod);
1656 }
1657 /*
1658 use xm0, xm1, xm2
1659 */
// Two calls to p_FpDbl_mod with (z, x) = (gp1, gp2): the first on the
// incoming pointers, the second on gp1 + 32 and (saved gp2) + 64.
// gp2 is kept in xm2 across the first call.
// NOTE(review): the second call assumes gp1 still holds z after the first
// call returns — confirm against p_FpDbl_mod.
1660 void in_Fp2Dbl_mod()
1661 {
1662 movq(xm2, gp2);
1663 call(p_FpDbl_mod);
1664
1665 movq(gp2, xm2);
1666 add(gp2, 32 * 2);
1667 add(gp1, 32);
1668 call(p_FpDbl_mod);
1669 }
1670 /*
1671 pz[3..0] <- mont_mod(px[7..0]),
1672 pz[7..4] <- mont_mod(px[15..8]).
1673 */
1674 void make_Fp2Dbl_mod()
1675 {
1676 MakeStackFrame<> sf(this, 10);
1677 in_Fp2Dbl_mod();
1678 }
1679 /*
1680 input [x3:x2:x1:x0] < 6p
1681 output [x3:x2:x1:x0] % p
1682 destroy rax, rdx, t0, t1, t2, t3
1683 (i*p) >> 253 = 0,1,2,3,4,5,6 for i = 0, .., 6
1684 t = (i * p) >> 253
1685 */
// The top three bits of x3 pick an entry of s_pTbl (32 bytes per entry),
// which is subtracted from x. If that subtraction borrows, p (the entry at
// s_pTbl + sizeof(Fp)) is added back through an all-ones/all-zero mask.
1686 void fast_modp(
1687 const Reg64& x3, const Reg64& x2, const Reg64& x1, const Reg64& x0,
1688 const Reg64& t0, const Reg64& t1, const Reg64& t2, const Reg64& t3)
1689 {
1690 const Reg64& a = rax;
1691 mov(rdx, x3);
1692 shr(rdx, 61); // rdx = [0:x3_63:x3_62:x3_61]
1693
1694 shl(rdx, 5); // sizeof(Fp) = 32
1695 mov(a, (uint64_t)&s_pTbl[0]);
1696 sub_rm(x3, x2, x1, x0, a + rdx);
// sbb of a register with itself: rdx = -1 if the subtraction borrowed, else 0.
1697 sbb(rdx, rdx);
1698
1699 load_rm(t3, t2, t1, t0, a + sizeof(Fp));
1700 and_(t0, rdx);
1701 and_(t1, rdx);
1702 and_(t2, rdx);
1703 and_(t3, rdx); // [t3:t2:t1:t0] = x < 0 ? p : 0
1704 add_rr(x3, x2, x1, x0, t3, t2, t1, t0);
1705 }
1706
1707 // (z, x) = (gp1, gp2)
1708 // call:295
1709 void set_p_Fp2_square()
1710 {
1711 align(16);
1712 p_Fp2_square = (void*)const_cast<uint8_t*>(getCurr());
1713 in_Fp2_square();
1714 ret();
1715 }
1716 // 278clk x 295
// Fp2 squaring body with (z, x) = (gp1, gp2), x = (a, b):
//   d0 = (2 * b) * a          -> reduced into z.b_ (second p_FpDbl_mod call)
//   d1 = (a + p - b)(a + b)   -> reduced into z.a_ (first p_FpDbl_mod call)
// t, d0 and d1 are temporaries laid out on the stack; rsp is lowered by SS on
// entry and raised again before returning.
1717 void in_Fp2_square()
1718 {
1719//begin_clock();
1720 const Ext2<Fp> z(gp1);
1721 const Ext2<Fp> x(gp2);
1722 const Ext1<Fp> t(rsp);
1723 const Ext1<FpDbl> d0(rsp, t.next);
1724 const Ext1<FpDbl> d1(rsp, d0.next);
1725 const int SS = d1.next;
1726 sub(rsp, SS);
1727
1728#ifdef BN_SUPPORT_SNARK
// SNARK build: 2 * b is passed through in_Fp_add_modp() and a + b is formed
// with in_Fp_add; the other build uses the unreduced forms.
1729 mov(rax, (uint64_t)&s_pTbl[1]);
1730 load_rm(gt4, gt3, gt2, gt1, x.b_);
1731 add_rr(gt4, gt3, gt2, gt1, gt4, gt3, gt2, gt1);
1732 in_Fp_add_modp(); // XITAG
1733 store_mr(t, gt4, gt3, gt2, gt1); // t = 2 * b
1734
1735 // d0 = t[3..0] * a
1736 mul4x4(d0, t, x, gt10, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1);
1737
1738 mov(rax, (uint64_t)&s_pTbl[1]);
1739
1740 load_add_rm(gt4, gt3, gt2, gt1, x.a_, rax, false); // t = a + p
1741 sub_rm(gt4, gt3, gt2, gt1, x.b_); // a + p - b
1742 store_mr(t, gt4, gt3, gt2, gt1); // t = a + p - b
1743
1744 in_Fp_add(z.a_, x.a_, x.b_);
1745#else
1746 load_rm(gt4, gt3, gt2, gt1, x.b_);
1747 add_rr(gt4, gt3, gt2, gt1, gt4, gt3, gt2, gt1);
1748 store_mr(t, gt4, gt3, gt2, gt1); // t = 2 * b
1749
1750 // d0 = t[3..0] * a
1751 mul4x4(d0, t, x, gt10, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1);
1752
1753 mov(rax, (uint64_t)&s_pTbl[1]);
1754
1755 load_add_rm(gt4, gt3, gt2, gt1, x.a_, rax, false); // t = a + p
1756 sub_rm(gt4, gt3, gt2, gt1, x.b_); // a + p - b
1757 store_mr(t, gt4, gt3, gt2, gt1); // t = a + p - b
1758
1759 in_Fp_add_carry(z.a_, x.a_, x.b_, false); // z.a_ = a + b
1760#endif
1761 // d1 = (a + p - b)(a + b)
1762 mul4x4(d1, t, z.a_, gt10, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1);
1763
// z.a_ <- mont_mod(d1); gp1 still points at z here.
1764 lea(gp2, ptr [d1]);
1765// mont_mod();
1766 call(p_FpDbl_mod);
1767
// z.b_ <- mont_mod(d0); gp1 is advanced by 32 bytes to z.b_.
1768 lea(gp2, ptr [d0]);
1769 add(gp1, 8 * 4);
1770// mont_mod();
1771 call(p_FpDbl_mod);
1772 add(rsp, SS);
1773//end_clock();
1774 }
1775 /*
1776 square(Fp2& z, const Fp2& x)
1777 z = x * x
1778 */
1779 void make_Fp2_square()
1780 {
1781 MakeStackFrame<> sf(this, 10);
1782 call(p_Fp2_square);
1783 }
1784 /*
1785 pz[7..0] -= px[7..0]
1786 */
1787 void sub_Fp2Dbl_subNC(const RegExp& pz, const RegExp& px,
1788 const Reg64& t3, const Reg64& t2, const Reg64& t1, const Reg64& t0)
1789 {
1790 load_sub_rm(t3, t2, t1, t0, pz, px, false);
1791 store_mr(pz + 8 * 0, t3, t2, t1, t0);
1792
1793 load_sub_rm(t3, t2, t1, t0, pz + sizeof(Fp), px + sizeof(Fp), true);
1794 store_mr(pz + sizeof(Fp), t3, t2, t1, t0);
1795 }
1796
1797 /*
1798 destroy : rax, gt1, ..., gt7
1799 addNC(z, x, pN);
1800 subNC(z, z, y);
1801 */
// The constant added is Fp::Dbl::pNTbl_[2]. Only its limb at offset 8 * 3 is
// added to the low half (the lower limbs are zero per the comment below);
// the high half is added with carry, then my is subtracted across both halves.
// rdx is also written (it holds the top limb of the high half).
1802 void in_FpDbl_subOpt1(const RegExp& mz, const RegExp& mx, const RegExp& my)
1803 {
1804 mov(rax, (uint64)&Fp::Dbl::pNTbl_[2]);
1805 load_rm(gt4, gt3, gt2, gt1, mx);
1806 // 192-bits lower value of pNTbl_[2] is zero
1807 add(gt4, ptr [rax + 8 * 3]); // add_rm(gt4, gt3, gt2, gt1, rax + 8 * 3);
1808 load_add_rm(rdx, gt7, gt6, gt5, mx + sizeof(Fp), rax + sizeof(Fp), true);
1809 sub_rm(gt4, gt3, gt2, gt1, my);
// The store sits between sub_rm and sbb_rm, so the borrow is expected to
// survive it (plain mov stores do not modify flags) — presumably store_mr
// emits only movs; confirm.
1810 store_mr(mz, gt4, gt3, gt2, gt1);
1811 sbb_rm(rdx, gt7, gt6, gt5, my + sizeof(Fp));
1812 store_mr(mz + 32, rdx, gt7, gt6, gt5);
1813 }
1814
1815 // 359clk x 290
// Emits the shared Fp2 multiplication routine (entry recorded in p_Fp2_mul)
// with (z, x, y) = (gp1, gp2, gp3):
//   d0 = (x.a_ + x.b_)(y.a_ + y.b_) - x.a_ * y.a_ - x.b_ * y.b_  -> z.b_
//   d1 = x.a_ * y.a_ - x.b_ * y.b_                               -> z.a_
// s, t, d0, d1, d2 are stack temporaries; SS bytes are reserved on rsp.
1816 void set_p_Fp2_mul()
1817 {
1818 align(16);
1819 p_Fp2_mul = (void*)const_cast<uint8_t*>(getCurr());
1820//begin_clock();
1821
1822 const Ext2<Fp> z(gp1);
1823 const Ext2<Fp> x(gp2);
1824 const Ext2<Fp> y(gp3);
1825
1826 const Ext1<Fp> s(rsp);
1827 const Ext1<Fp> t(rsp, s.next);
1828 const Ext1<FpDbl> d0(rsp, t.next);
1829 const Ext1<FpDbl> d1(rsp, d0.next);
1830 const Ext1<FpDbl> d2(rsp, d1.next);
1831 const int SS = d2.next;
1832 sub(rsp, SS);
1833 // x.a_ + x.b_
1834 in_Fp_addNC(s, x.a_, x.b_);
1835 // y.a_ + y.b_
1836 in_Fp_addNC(t, y.a_, y.b_);
1837
1838 mul4x4(d0, s, t, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1, gt10); // d0 = s * t
1839 mul4x4(d1, x.a_, y.a_, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1, gt10); // d1 = x.a_ * y.a_
1840 mul4x4(d2, x.b_, y.b_, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1, gt10); // d2 = x.b_ * y.b_
1841
1842 // d0 -= d1
1843 sub_Fp2Dbl_subNC(d0, d1, gt3, gt2, gt1, gt10);
1844 // d0 -= d2
1845 sub_Fp2Dbl_subNC(d0, d2, gt3, gt2, gt1, gt10);
1846
// d1 -= d2
1847 sub_FpDbl_sub(d1, d1, d2);
1848
// z.a_ <- mont_mod(d1); gp1 still points at z.
1849 lea(gp2, ptr [d1]);
1850// mont_mod();
1851 call(p_FpDbl_mod);
1852
// z.b_ <- mont_mod(d0); gp1 is advanced by sizeof(Fp).
1853 lea(gp2, ptr [d0]);
1854 add(gp1, sizeof(Fp));
1855// mont_mod();
1856 call(p_FpDbl_mod);
1857 add(rsp, SS);
1858//end_clock();
1859 ret();
1860 }
1861
1862 // mul(Fp2T& z, const Fp2T& x, const Fp2T& y)
1863 void make_Fp2_mul()
1864 {
1865 MakeStackFrame<> sf(this, 10);
1866 call(p_Fp2_mul);
1867 }
1868
1869 // uint64_t preInv(FpT& r, const FpT& x)
// State kept across the loop:
//   u (u3..u0) starts as p, v (v3..v0) starts as x, s (s3..s0) starts as 1,
//   and a second 4-limb accumulator, initially 0, lives in
//   [r:a:rsp[1]:rsp[0]] (the `r` register is reused once the output pointer
//   has been saved at rsp + 16).
// The loop runs until v == 0; xm4 (k) is incremented once per iteration and
// is the value returned in rax. At exit, s_pTbl[2] minus the accumulator is
// stored to the output.
// NOTE(review): this looks like the binary "almost inverse" loop with k as the
// shift count to be corrected by the caller — confirm against the C++ caller.
1870 void make_Fp_preInv()
1871 {
1872 MakeStackFrame<> sf(this, 10, 4);
1873 const Reg64& r = gp1;
1874 const Reg64& v0 = gp2;
1875 const Reg64& v1 = gp3;
1876 const Reg64& v2 = gt1;
1877 const Reg64& v3 = gt2;
1878 const Reg64& u0 = gt3;
1879 const Reg64& u1 = gt4;
1880 const Reg64& u2 = gt5;
1881 const Reg64& u3 = gt6;
1882 const Reg64& s0 = gt7;
1883 const Reg64& s1 = gt8;
1884 const Reg64& s2 = gt9;
1885 const Reg64& s3 = gt10;
1886 const Reg64& t = rdx;
1887
1888 inLocalLabel();
1889 const Reg64& a = rax;
1890 const Xmm& k = xm4;
1891 const Xmm& one = xm5;
1892 const Xmm& xt0 = xm0;
1893 const Xmm& xt1 = xm1;
1894 const Xmm& xt2 = xm2;
1895 const Xmm& xt3 = xm3;
1896 mov(t, (uint64_t)&s_pTbl[1]);
1897 load_rm(u3, u2, u1, u0, t); // u = p
// v0 aliases gp2 (the pointer to x), so it must be loaded last.
1898 mov(v3, ptr [v0 + 8 * 3]);
1899 mov(v2, ptr [v0 + 8 * 2]);
1900 mov(v1, ptr [v0 + 8 * 1]);
1901 mov(v0, ptr [v0 + 8 * 0]); // v = x
1902 xor_(s3, s3);
1903 lea(s0, ptr [s3 + 1]);
1904 mov(s1, s3);
1905 mov(s2, s3); // s[3:2:1:0] = 1
1906
1907 // r = [r:a:rsp[1]:rsp[0]]
1908 mov(ptr [rsp + 8 * 0], s3);
1909 mov(ptr [rsp + 8 * 1], s3);
1910 mov(ptr [rsp + 8 * 2], r); // save r
1911 xor_(a, a);
1912 xor_(r, r);
1913
1914 pxor(k, k); // k
1915 pxor(one, one);
1916 movq(one, s0);
1917
1918 align(16);
1919 L(".lp");
// Loop test: leave when all four limbs of v are zero.
1920 mov(t, v0);
1921 or_(t, v1);
1922 or_(t, v2);
1923 or_(t, v3);
1924 jz(".exit", T_NEAR);
1925 test(u0, 1);
1926 jz(".u_even", T_NEAR);
1927 test(v0, 1);
1928 jz(".v_even");
// Both odd: try v -= u, keeping a copy of v in xt0..xt3 in case it borrows.
1929 movq(xt0, v0);
1930 movq(xt1, v1);
1931 movq(xt2, v2);
1932 movq(xt3, v3);
1933 sub_rr(v3, v2, v1, v0, u3, u2, u1, u0);
1934 jc(".next3");
// No borrow (v >= u): s += accumulator, then fall into the v-even step.
1935 add(s0, ptr [rsp + 8 * 0]);
1936 adc(s1, ptr [rsp + 8 * 1]);
1937 adc(s2, a);
1938 adc(s3, r);
1939 L(".v_even");
// v >>= 1 and accumulator <<= 1 (each limb is added to itself with carry).
1940 shr1(v3, v2, v1, v0);
1941 mov(t, ptr [rsp + 8 * 0]);
1942 add(ptr [rsp + 8 * 0], t);
1943 mov(t, ptr [rsp + 8 * 1]);
1944 adc(ptr [rsp + 8 * 1], t);
1945 adc(a, a);
1946 adc(r, r);
1947 paddd(k, one);
1948 jmp(".lp");
1949 align(16);
1950 L(".next3");
// Borrow (v < u): restore v, then u -= v and accumulator += s.
1951 movq(v0, xt0);
1952 movq(v1, xt1);
1953 movq(v2, xt2);
1954 movq(v3, xt3);
1955 sub_rr(u3, u2, u1, u0, v3, v2, v1, v0);
1956 add(ptr [rsp + 8 * 0], s0);
1957 adc(ptr [rsp + 8 * 1], s1);
1958 adc(a, s2);
1959 adc(r, s3);
1960 L(".u_even");
// u >>= 1 and s <<= 1.
1961 shr1(u3, u2, u1, u0);
1962 shl1(s3, s2, s1, s0);
1963 paddd(k, one);
1964 jmp(".lp", T_NEAR);
1965 align(16);
1966 L(".exit");
1967 // r = 2p - r
1968 // if (r >= p) r -= p ; this is unnecessary because next function is mul
1969 mov(t, (uint64_t)&s_pTbl[2]);
1970 load_rm(s3, s2, s1, s0, t);
1971 sub(s0, ptr [rsp + 8 * 0]);
1972 sbb(s1, ptr [rsp + 8 * 1]);
1973 sbb(s2, a);
1974 sbb(s3, r);
// Reload the output pointer saved at rsp + 16 and store the result.
1975 mov(r, ptr [rsp + 8 * 2]);
1976 store_mr(r, s3, s2, s1, s0);
// Return value: the iteration counter k.
1977 movq(rax, k);
1978
1979 outLocalLabel();
1980 }
1981
// Profiling helper: reads the time-stamp counter and subtracts it (as a
// 64-bit value split over two dwords) from the counter at &sclk.
// Paired with end_clock(), which adds the later reading.
// destroy : gt1, rax, rdx
1982 void begin_clock()
1983 {
1984 mov(gt1, (size_t)&sclk);
1985 rdtsc();
1986 sub(ptr [gt1], eax);
1987 sbb(ptr [gt1 + 4], edx);
1988 }
// Profiling helper: reads the time-stamp counter, adds it (as a 64-bit value
// split over two dwords) to the counter at &sclk, and increments the dword
// at offset 8 of sclk.
// destroy : gt1, rax, rdx
1989 void end_clock()
1990 {
1991 mov(gt1, (size_t)&sclk);
1992 rdtsc();
1993 add(ptr [gt1], eax);
1994 adc(ptr [gt1 + 4], edx);
1995 inc(dword [gt1 + 8]);
1996 }
1997 /*
1998 mulOpt(Fp2Dbl& z, const Fp2T& x, const Fp2T& y);
1999 input : (pz, px, py) = (gp1, gp2, gp3)
2000 stack : 8 * 16
2001 202clk x 2058
2002 */
// Unreduced Fp2 product written to z:
//   z.b_ = (x.a_ + x.b_)(y.a_ + y.b_) - x.a_ * y.a_ - x.b_ * y.b_
//   z.a_ = x.a_ * y.a_ - x.b_ * y.b_
// mode selects how the last subtraction is done: in_FpDbl_subOpt1 for
// mode == 1, sub_FpDbl_sub otherwise. The emitted code ends with ret.
2003 void sub_Fp2Dbl_mulOpt(int mode)
2004 {
2005 Ext2<FpDbl> z(gp1);
2006 Ext2<Fp> x(gp2);
2007 Ext2<Fp> y(gp3);
2008 // Fp s, t;
2009 // FpDbl d0;
2010 Ext1<Fp> s(rsp);
2011 Ext1<Fp> t(rsp, s.next);
2012 Ext1<FpDbl> d0(rsp, t.next);
2013 const int SS = d0.next;
2014 sub(rsp, SS);
2015
2016 // x.a_ + x.b_
2017 in_Fp_addNC(s, x.a_, x.b_);
2018 // y.a_ + y.b_
2019 in_Fp_addNC(t, y.a_, y.b_);
2020
// The d0/d1/d2 names in the comments below follow set_p_Fp2_mul; here the
// stack slot d0 holds x.b_ * y.b_, z.a_ holds x.a_ * y.a_ and z.b_ holds s * t.
2021 mul4x4(d0, x.b_, y.b_, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1, gt10); // d2 = x.b_ * y.b_
2022 mul4x4(z.a_, x.a_, y.a_, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1, gt10); // d1 = x.a_ * y.a_
2023 mul4x4(z.b_, s, t, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1, gt10); // d0 = s * t
2024
2025 // d0 -= d1(subNC)
2026 load_sub_rm(gt3, gt2, gt1, gt10, z.b_, z.a_, false);
2027
2028 load_sub_rm(gt7, gt6, gt5, gt4, (RegExp)z.b_ + sizeof(Fp), (RegExp)z.a_ + sizeof(Fp), true);
2029 // d0 -= d2(subNC)
2030 sub_rm(gt3, gt2, gt1, gt10, d0);
2031 sbb_rm(gt7, gt6, gt5, gt4, (RegExp)d0 + sizeof(Fp));
2032
2033 // set return value z.b_
2034 store_mr(z.b_, gt3, gt2, gt1, gt10);
2035 store_mr((RegExp)z.b_ + sizeof(Fp), gt7, gt6, gt5, gt4);
2036
2037 if (mode == 1) {
2038 // call:606
2039 in_FpDbl_subOpt1(z.a_, z.a_, d0);
2040 } else {
2041 // call:1452
2042// in_FpDbl_sub(z.a_, z.a_, d0);
2043 sub_FpDbl_sub(z.a_, z.a_, d0);
2044 }
2045 add(rsp, SS);
2046 ret();
2047 }
2048 void set_p_Fp2Dbl_mulOpt(int mode)
2049 {
2050 align(16);
2051 switch (mode) {
2052 case 1:
2053 p_Fp2Dbl_mulOpt1 = (void*)const_cast<uint8_t*>(getCurr());
2054 break;
2055 case 2:
2056 p_Fp2Dbl_mulOpt2 = (void*)const_cast<uint8_t*>(getCurr());
2057 break;
2058 default:
2059 printf("err set_p_Fp2Dbl_mulOpt mode=%d\n", mode);
2060 }
2061 sub_Fp2Dbl_mulOpt(mode);
2062 }
// Emits the shared Fp2Dbl multiply-by-xi routine (entry recorded in
// p_Fp2Dbl_mul_xi) with (z, x) = (gp1, gp2); x.a is at gp2 and x.b at
// gp2 + sizeof(FpDbl), z.a at gp1 and z.b at gp1 + 64.
//   BN_SUPPORT_SNARK : z.a = 9 * x.a - x.b, z.b = 9 * x.b + x.a
//   otherwise        : z.a = x.a - x.b,     z.b = x.b + x.a
2063 void set_p_Fp2Dbl_mul_xi()
2064 {
2065 align(16);
2066 p_Fp2Dbl_mul_xi = (void*)const_cast<uint8_t*>(getCurr());
2067 mov(rax, (uint64_t)&s_pTbl[1]);
2068#ifdef BN_SUPPORT_SNARK
2069 in_FpDbl_add(gp1, gp2, gp2); // 2
2070 in_FpDbl_add(gp1, gp1, gp1); // 4
2071 in_FpDbl_add(gp1, gp1, gp1); // 8
2072 in_FpDbl_add(gp1, gp1, gp2); // 9
2073 sub_FpDbl_sub(gp1, gp1, gp2 + sizeof(FpDbl));
2074
2075 in_FpDbl_add(gp1 + 64, gp2 + sizeof(FpDbl), gp2 + sizeof(FpDbl)); // 2
2076 in_FpDbl_add(gp1 + 64, gp1 + 64, gp1 + 64); // 4
2077 in_FpDbl_add(gp1 + 64, gp1 + 64, gp1 + 64); // 8
2078 in_FpDbl_add(gp1 + 64, gp1 + 64, gp2 + sizeof(FpDbl)); // 9
2079 in_FpDbl_add(gp1 + 64, gp1 + 64, gp2);
2080#else
2081 sub_FpDbl_sub(gp1, gp2, gp2 + sizeof(FpDbl));
2082 in_FpDbl_add(gp1 + 64, gp2 + sizeof(FpDbl), gp2);
2083#endif
2084 ret();
2085 }
2086
2087 // Fp2::Dbl::mulOpt1(Dbl &z, const Fp2T &x, const Fp2T &y) h = 2
2088 void make_Fp2Dbl_mulOpt(int mode)
2089 {
2090 MakeStackFrame<> sf(this, 10);
2091 if (mode == 1) {
2092 call(p_Fp2Dbl_mulOpt1);
2093 } else {
2094 call(p_Fp2Dbl_mulOpt2);
2095 }
2096 }
2097
2098 /*
2099 Fp6::mul(Fp6Dbl& z, const Fp6T& x, const Fp6T& y);
2100 input (z, x, y) = (xm3, xm4, xm5)
2101 ca::194
2102 */
// Emits the shared unreduced Fp6 multiplication routine (entry recorded in
// p_Fp6Dbl_mul). The three pointers arrive in xm3/xm4/xm5 and are copied into
// gt8/gt9/gt10 (z.r_, x.r_, y.r_); they are reloaded from the xmm registers
// after calls into the mulOpt routines. t0, t1, T0, T1, T2 are stack
// temporaries (SS bytes). The commented C++ statement above each group names
// the operation that group emits.
2103 void set_p_Fp6Dbl_mul()
2104 {
2105 align(16);
2106 p_Fp6Dbl_mul = (void*)const_cast<uint8_t*>(getCurr());
2107
2108 // Fp2 t0, t1;
2109 // Fp2Dbl T0, T1, T2;
2110
2111 const Ext6<FpDbl> z(gt8);
2112 const Ext6<Fp> x(gt9);
2113 const Ext6<Fp> y(gt10);
2114 const Ext2<Fp> t0(rsp);
2115 const Ext2<Fp> t1(rsp, t0.next);
2116 const Ext2<FpDbl> T0(rsp, t1.next);
2117 const Ext2<FpDbl> T1(rsp, T0.next);
2118 const Ext2<FpDbl> T2(rsp, T1.next);
2119 const int SS = T2.next;
2120
2121 sub(rsp, SS);
2122 const Xmm& zsave = xm3;
2123 const Xmm& xsave = xm4;
2124 const Xmm& ysave = xm5;
2125 movq(z.r_, zsave);
2126 movq(x.r_, xsave);
2127 movq(y.r_, ysave);
2128
2129 // Fp2Dbl::mulOpt1(T0, x.a_, y.a_);
2130 lea(gp1, ptr [T0]);
2131 lea(gp2, ptr [x.a_]);
2132 lea(gp3, ptr [y.a_]);
2133 call(p_Fp2Dbl_mulOpt1);
2134
2135 // Fp2Dbl::mulOpt1(T1, x.b_, y.b_);
2136 movq(x.r_, xsave);
2137 movq(y.r_, ysave);
2138 lea(gp1, ptr [T1]);
2139 lea(gp2, ptr [x.b_]);
2140 lea(gp3, ptr [y.b_]);
2141 call(p_Fp2Dbl_mulOpt1);
2142
2143 // Fp2Dbl::mulOpt1(T2, x.c_, y.c_);
2144 movq(x.r_, xsave);
2145 movq(y.r_, ysave);
2146 lea(gp1, ptr [T2]);
2147 lea(gp2, ptr [x.c_]);
2148 lea(gp3, ptr [y.c_]);
2149 call(p_Fp2Dbl_mulOpt1);
2150
2151 // Fp2::addNC(t0, x.b_, x.c_);
2152 movq(x.r_, xsave);
2153 movq(y.r_, ysave);
2154 in_Fp2_addNC(t0, x.b_, x.c_);
2155 // Fp2::addNC(t1, y.b_, y.c_);
2156 in_Fp2_addNC(t1, y.b_, y.c_);
2157
2158 // Fp2Dbl::mulOpt2(z.c_, t0, t1);
2159 movq(z.r_, zsave);
2160 lea(gp1, ptr [z.c_]);
2161 lea(gp2, ptr [t0]);
2162 lea(gp3, ptr [t1]);
2163 call(p_Fp2Dbl_mulOpt2);
2164
2165 // Fp2Dbl::addNC(z.b_, T1, T2);
2166 movq(z.r_, zsave);
2167 movq(x.r_, xsave);
2168 movq(y.r_, ysave);
2169 in_FpDbl_addNC(2, z.b_, T1, T2);
2170
2171 // FpDbl::sub(z.c_.a_, z.c_.a_, z.b_.a_);
2172 in_FpDbl_sub(z.c_.a_, z.c_.a_, z.b_.a_);
2173
2174 // FpDbl::subNC(z.c_.b_, z.c_.b_, z.b_.b_);
2175 in_FpDbl_subNC(z.c_.b_, z.c_.b_, z.b_.b_);
2176
2177 // Fp2Dbl::mul_xi(z.b_, z.c_);
2178 in_Fp2Dbl_mul_xi(z.b_, z.c_);
2179
2180 // Fp2Dbl::add(z.a_, z.b_, T0);
2181 in_Fp2Dbl_add(z.a_, z.b_, T0);
2182
2183 // Fp2::addNC(t0, x.a_, x.b_);
2184 in_Fp2_addNC(t0, x.a_, x.b_);
2185
2186 // Fp2::addNC(t1, y.a_, y.b_);
2187 in_Fp2_addNC(t1, y.a_, y.b_);
2188
2189 // Fp2Dbl::mulOpt2(z.c_, t0, t1);
2190 lea(gp1, ptr [z.c_]);
2191 lea(gp2, ptr [t0]);
2192 lea(gp3, ptr [t1]);
2193 call(p_Fp2Dbl_mulOpt2);
2194
2195 movq(z.r_, zsave);
2196 movq(x.r_, xsave);
2197 movq(y.r_, ysave);
2198
2199 // Fp2Dbl::addNC(z.b_, T0, T1);
2200 in_FpDbl_addNC(2, z.b_, T0, T1);
2201
2202 // FpDbl::sub(z.c_.a_, z.c_.a_, z.b_.a_);
2203 in_FpDbl_sub(z.c_.a_, z.c_.a_, z.b_.a_);
2204
2205 // FpDbl::subNC(z.c_.b_, z.c_.b_, z.b_.b_);
2206 in_FpDbl_subNC(z.c_.b_, z.c_.b_, z.b_.b_);
2207
// z.b_ = xi * T2. The SNARK build calls the shared mul_xi routine; the other
// build expands it inline as (T2.a_ - T2.b_, T2.a_ + T2.b_).
2208 // FpDbl::subOpt1(z.b_.a_, T2.a_, T2.b_);
2209#ifdef BN_SUPPORT_SNARK
2210 in_Fp2Dbl_mul_xi(z.b_, T2);
2211#else
2212 in_FpDbl_subOpt1(z.b_.a_, T2.a_, T2.b_);
2213
2214 // FpDbl::add(z.b_.b_, T2.a_, T2.b_);
2215 in_FpDbl_add(z.b_.b_, T2.a_, T2.b_);
2216
2217#endif
2218 // Fp2Dbl::add(z.b_, z.b_, z.c_);
2219 in_Fp2Dbl_add(z.b_, z.b_, z.c_);
2220
2221 // Fp2::addNC(t0, x.a_, x.c_);
2222 in_Fp2_addNC(t0, x.a_, x.c_);
2223
2224 // Fp2::addNC(t1, y.a_, y.c_);
2225 in_Fp2_addNC(t1, y.a_, y.c_);
2226
2227 // Fp2Dbl::mulOpt2(z.c_, t0, t1);
2228 lea(gp1, ptr [z.c_]);
2229 lea(gp2, ptr [t0]);
2230 lea(gp3, ptr [t1]);
2231 call(p_Fp2Dbl_mulOpt2);
2232
2233 movq(z.r_, zsave);
2234
2235 // Fp2Dbl::addNC(T2, T2, T0);
2236 in_FpDbl_addNC(2, T2, T2, T0);
2237
2238 // FpDbl::sub(z.c_.a_, z.c_.a_, T2.a_);
2239 in_FpDbl_sub(z.c_.a_, z.c_.a_, T2.a_);
2240
2241 // FpDbl::add(z.c_.a_, z.c_.a_, T1.a_);
2242 in_FpDbl_add(z.c_.a_, z.c_.a_, T1.a_);
2243
2244#if 0
// Disabled variant: fuses the subNC/addNC pair below into one register pass.
2245 load_sub_rm(gt4, gt3, gt2, gt1, z.c_.b_, T2.b_, false);
2246 load_sub_rm(rdx, rax, gt6, gt5, (RegExp)z.c_.b_ + sizeof(Fp), (RegExp)T2.b_ + sizeof(Fp), true);
2247 add_rm(gt4, gt3, gt2, gt1, T1.b_);
2248 adc_rm(rdx, rax, gt6, gt5, (RegExp)T1.b_ + sizeof(Fp));
2249 store_mr(z.c_.b_, gt4, gt3, gt2, gt1);
2250 store_mr((RegExp)z.c_.b_ + sizeof(Fp), rdx, rax, gt6, gt5);
2251#else
2252 // FpDbl::subNC(z.c_.b_, z.c_.b_, T2.b_);
2253 in_FpDbl_subNC(z.c_.b_, z.c_.b_, T2.b_);
2254
2255 // FpDbl::addNC(z.c_.b_, z.c_.b_, T1.b_);
2256 in_FpDbl_addNC(z.c_.b_, z.c_.b_, T1.b_);
2257#endif
2258 add(rsp, SS);
2259 ret();
2260 }
2261 /*
2262 Fp6::Dbl::mul(Dbl& z, const Fp6T& x, const Fp6T& y);
2263 */
2264 void make_Fp6Dbl_mul()
2265 {
2266 MakeStackFrame<> sf(this, 10);
2267 movq(xm3, gp1);
2268 movq(xm4, gp2);
2269 movq(xm5, gp3);
2270 call(p_Fp6Dbl_mul);
2271 }
2272 /*
2273 (z, x, y) = (gp1, gp2, gp3)
2274 */
2275 void set_p_Fp6_mul()
2276 {
2277 align(16);
2278 p_Fp6_mul = (void*)const_cast<uint8_t*>(getCurr());
2279
2280 const int SS = sizeof(Fp6Dbl);
2281 sub(rsp, SS);
2282 movq(xm2, gp1);
2283 movq(xm3, rsp);
2284 movq(xm4, gp2);
2285 movq(xm5, gp3);
2286 call(p_Fp6Dbl_mul);
2287
2288 for (int i = 0; i < 6; i++) {
2289 movq(gp1, xm2);
2290 if (i == 0) {
2291 mov(gp2, rsp);
2292 } else {
2293 add(gp1, 32 * i);
2294 lea(gp2, ptr [rsp + 64 * i]);
2295 }
2296 call(p_FpDbl_mod);
2297 }
2298 add(rsp, SS);
2299 ret();
2300 }
2301 /*
2302 Fp6::mul(Fp6& z, const Fp6T& x, const Fp6T& y);
2303 */
2304 void make_Fp6_mul()
2305 {
2306 MakeStackFrame<> sf(this, 10);
2307 call(p_Fp6_mul);
2308 }
2309 // for debug
2310 void debug_save_buf(const RegExp& m, int n)
2311 {
2312 static uint64 save[3];
2313
2314 // don't change rsp
2315 push(rcx);
2316 mov(rcx, (size_t)save);
2317 mov(ptr [rcx], rax);
2318 mov(ptr [rcx + 8], rdx);
2319 mov(ptr [rcx + 16], rbx);
2320 pop(rcx);
2321
2322 mov(rdx, (size_t)debug_buf);
2323 lea(rbx, ptr [m]);
2324 for (int i = 0; i < n; i++) {
2325 mov(rax, ptr [rbx + i * 8]);
2326 mov(ptr [rdx + i * 8], rax);
2327 }
2328 push(rcx);
2329 mov(rcx, (size_t)save);
2330 mov(rax, ptr [rcx]);
2331 mov(rdx, ptr [rcx + 8]);
2332 mov(rbx, ptr [rbx + 16]);
2333 pop(rcx);
2334 }
// Debug helper: emits code that adds 1 to the dword at &debug_counter.
// rax is preserved with push/pop; the add modifies the flags.
2335 void debug_count_inc()
2336 {
2337 push(rax);
2338 mov(rax, (size_t)&debug_counter);
2339 add(dword [rax], 1);
2340 pop(rax);
2341 }
2342 /*
2343 Compress::square_n(Compress& z, int n);
2344 input : gp1 = pointer to z.z_
2345 */
// Emits a loop that runs the squaring body n times (n arrives in gp2 and is
// kept at rsp + nsave). The pointer loaded from [gp1] is kept in xm3 (zsave)
// and reloaded into gt10 (z) after calls. g2..g5 are byte offsets of the Fp2
// members g2_..g5_ relative to that pointer. t0..t2 and T0..T3 are stack
// temporaries. The commented C++ statement above each group names the
// operation that group emits.
// NOTE(review): the counter is tested after the body, so n == 0 would not
// exit immediately — presumably callers pass n >= 1; confirm.
2346 void make_Compress_square_n()
2347 {
2348 // Fp2 t0, t1, t2;
2349 // Fp2Dbl T0, T1, T2, T3;
2350 const Ext2<Fp> t0(rsp);
2351 const Ext2<Fp> t1(rsp, t0.next);
2352 const Ext2<Fp> t2(rsp, t1.next);
2353 const Ext2<FpDbl> T0(rsp, t2.next);
2354 const Ext2<FpDbl> T1(rsp, T0.next);
2355 const Ext2<FpDbl> T2(rsp, T1.next);
2356 const Ext2<FpDbl> T3(rsp, T2.next);
2357 const int nsave = T3.next;
2358 const int SS = nsave + 8;
2359
2360 MakeStackFrame<> sf(this, 10, SS / 8);
2361 const Xmm& zsave = xm3;
2362 const Reg64& z = gt10;
2363// const int g1 = sizeof(Fp2) * 4;
2364 const int g2 = sizeof(Fp2) * 3;
2365 const int g3 = sizeof(Fp2) * 2;
2366 const int g4 = sizeof(Fp2) * 1;
2367 const int g5 = sizeof(Fp2) * 5;
2368 mov(z, ptr [gp1]);
2369 mov(ptr [rsp + nsave], gp2);
2370 movq(zsave, z);
2371
2372 inLocalLabel();
2373 L(".lp");
2374
2375 // Fp2Dbl::square(T0, z.g4_);
2376 lea(gp1, ptr [T0]);
2377 lea(gp2, ptr [z + g4]);
2378 call(p_Fp2Dbl_square);
2379
2380 // Fp2Dbl::square(T1, z.g5_);
2381 lea(gp1, ptr [T1]);
2382 movq(gp2, zsave);
2383 add(gp2, g5);
2384 call(p_Fp2Dbl_square);
2385
2386 // Fp2Dbl::mul_xi(T2, T1);
2387 in_Fp2Dbl_mul_xi(T2, T1);
2388
2389 // T2 += T0;
2390 in_Fp2Dbl_add(T2, T2, T0);
2391
2392 // Fp2Dbl::mod(t2, T2);
2393 lea(gp1, ptr [t2]);
2394 lea(gp2, ptr [T2]);
2395 in_Fp2Dbl_mod();
2396
2397 // Fp2::add(t0, z.g4_, z.g5_);
2398 movq(z, zsave);
2399 in_Fp2_add(t0, z + g4, z + g5);
2400
2401 // Fp2Dbl::square(T2, t0);
2402 lea(gp1, ptr [T2]);
2403 lea(gp2, ptr [t0]);
2404 call(p_Fp2Dbl_square);
2405
2406// T0 += T1;
// The emitted operation is the reducing add (in_FpDbl_add); the addNC form
// in the comment below is an alternative the original author left open.
2407 // Fp2Dbl::addNC(T0, T0, T1); // QQQ : OK?
2408 movq(z, zsave);
2409 in_FpDbl_add(2, T0, T0, T1);
2410
2411 // T2 -= T0;
2412 in_Fp2Dbl_sub(T2, T2, T0);
2413
2414 // Fp2Dbl::mod(t0, T2);
2415 lea(gp1, ptr [t0]);
2416 lea(gp2, ptr [T2]);
2417 in_Fp2Dbl_mod();
2418
2419 // Fp2::add(t1, z.g2_, z.g3_);
2420 movq(z, zsave);
2421 in_Fp2_add(t1, z + g2, z + g3);
2422
2423 // Fp2Dbl::square(T3, t1);
2424 lea(gp1, ptr [T3]);
2425 lea(gp2, ptr [t1]);
2426 call(p_Fp2Dbl_square);
2427
2428 // Fp2Dbl::square(T2, z.g2_);
2429 movq(z, zsave);
2430 lea(gp1, ptr [T2]);
2431 lea(gp2, ptr [z + g2]);
2432 call(p_Fp2Dbl_square);
2433
2434 // Fp2::mul_xi(t1, t0);
2435 in_Fp2_mul_xi(t1, t0);
2436
2437#if 1
// z.g2_ = 2 * z.g2_ + 3 * t1 in one call; the #else branch is the same
// update written as three additions.
2438 lea(gp1, ptr [z + g2]);
2439 lea(gp2, ptr [t1]);
2440 call(p_Fp2_2z_add_3x);
2441#else
2442 // z.g2_ += t1;
2443 in_Fp2_add(z + g2, z + g2, t1);
2444
2445 // z.g2_ += z.g2_;
2446 in_Fp2_add(z + g2, z + g2, z + g2);
2447
2448 // z.g2_ += t1;
2449 in_Fp2_add(z + g2, z + g2, t1);
2450#endif
2451
2452 // Fp2::sub(t1, t2, z.g3_);
2453 in_Fp2_sub(t1, t2, z + g3);
2454
2455 // t1 += t1;
2456 in_Fp2_add(t1, t1, t1);
2457
2458 // Fp2Dbl::square(T1, z.g3_);
2459 lea(gp1, ptr [T1]);
2460 lea(gp2, ptr [z + g3]);
2461 call(p_Fp2Dbl_square);
2462
2463 // Fp2::add(z.g3_, t1, t2);
2464 movq(z, zsave);
2465 in_Fp2_add(z + g3, t1, t2);
2466
2467 // Fp2Dbl::mul_xi(T0, T1);
2468 in_Fp2Dbl_mul_xi(T0, T1);
2469
2470
2471// T0 += T2;
2472 // Fp2Dbl::addNC(T0, T0, T2); // QQQ : OK?
2473 in_FpDbl_add(2, T0, T0, T2);
2474
2475 // Fp2Dbl::mod(t0, T0);
2476 lea(gp1, ptr [t0]);
2477 lea(gp2, ptr [T0]);
2478 in_Fp2Dbl_mod();
2479
2480#if 1
// z.g4_ = 2 * (t0 + p - z.g4_) + t0 per Fp component, reduced once with
// fast_modp; the #else branch is the same update as three Fp2 operations.
2481 movq(z, zsave);
2482 for (int i = 0; i < 2; i++) {
2483 mov(rax, (uint64_t)&s_pTbl[1]);
2484 load_add_rm(gt4, gt3, gt2, gt1, (RegExp)t0 + sizeof(Fp) * i, rax, false);
2485 sub_rm(gt4, gt3, gt2, gt1, z + g4 + sizeof(Fp) * i);
2486 add_rr(gt4, gt3, gt2, gt1, gt4, gt3, gt2, gt1);
2487 add_rm(gt4, gt3, gt2, gt1, (RegExp)t0 + sizeof(Fp) * i);
2488 fast_modp(gt4, gt3, gt2, gt1, gp1, gp2, gp3, gt5);
2489 store_mr(z + g4 + sizeof(Fp) * i, gt4, gt3, gt2, gt1);
2490 }
2491#else
2492 // Fp2::sub(z.g4_, t0, z.g4_);
2493 movq(z, zsave);
2494 in_Fp2_sub(z + g4, t0, z + g4);
2495
2496 // z.g4_ += z.g4_;
2497 in_Fp2_add(z + g4, z + g4, z + g4);
2498
2499 // z.g4_ += t0;
2500 in_Fp2_add(z + g4, z + g4, t0);
2501#endif
2502
2503 // Fp2Dbl::addNC(T2, T2, T1);
2504 in_FpDbl_addNC(2, T2, T2, T1);
2505
2506 // T3 -= T2;
2507 in_Fp2Dbl_sub(T3, T3, T2);
2508
2509 // Fp2Dbl::mod(t0, T3);
2510 lea(gp1, ptr [t0]);
2511 lea(gp2, ptr [T3]);
2512 in_Fp2Dbl_mod();
2513
2514 // z.g5_ += t0;
2515 movq(z, zsave);
2516#if 1
// z.g5_ = 2 * z.g5_ + 3 * t0 in one call.
2517 lea(gp1, ptr [z + g5]);
2518 lea(gp2, ptr [t0]);
2519 call(p_Fp2_2z_add_3x);
2520#else
2521 in_Fp2_add(z + g5, z + g5, t0);
2522
2523 // z.g5_ += z.g5_;
2524 in_Fp2_add(z + g5, z + g5, z + g5);
2525 // z.g5_ += t0; // # 18
2526 in_Fp2_add(z + g5, z + g5, t0);
2527#endif
2528
// Decrement the iteration counter kept on the stack; loop while non-zero.
2529 sub(qword [rsp + nsave], 1);
2530 jnz(".lp", T_NEAR);
2531 outLocalLabel();
2532 }
2533 /*
2534 input (z, x) = (gp1, gp2)
2535 mz = 2mz + 3mx
2536 destroy : gp3, gt1, .., gt7, rax, rdx
2537 */
// Computed as ((mz + mx) * 2 + mx) in four limbs, then reduced with
// fast_modp (which documents its input bound as < 6p) and stored to mz.
2538 void in_Fp_2z_add_3x(const RegExp& mz, const RegExp& mx)
2539 {
2540 load_add_rm(gt4, gt3, gt2, gt1, mz, mx, false);
2541 add_rr(gt4, gt3, gt2, gt1, gt4, gt3, gt2, gt1);
2542 add_rm(gt4, gt3, gt2, gt1, mx);
2543 fast_modp(gt4, gt3, gt2, gt1, gp3, gt5, gt6, gt7);
2544 store_mr(mz, gt4, gt3, gt2, gt1);
2545 }
2546 /*
2547 input (z, x) = (gp1, gp2)
2548 mz = 2mz + 3mx
2549 destroy : gp3, gt1, .., gt7, rax, rdx
2550 */
// Generates the callable Fp2 version of in_Fp_2z_add_3x and records its entry
// address in p_Fp2_2z_add_3x. The generated routine applies the Fp operation
// to the two consecutive Fp values at [gp1]/[gp2] and at
// [gp1 + sizeof(Fp)]/[gp2 + sizeof(Fp)], then returns.
2551 void set_p_Fp2_2z_add_3x()
2552 {
2553 align(16);
2554 p_Fp2_2z_add_3x = (void*)const_cast<uint8_t*>(getCurr()); // entry point = current emit position
2555 in_Fp_2z_add_3x(gp1, gp2); // first Fp component
2556 in_Fp_2z_add_3x(gp1 + sizeof(Fp), gp2 + sizeof(Fp)); // second Fp component
2557 ret();
2558 }
2559 void sub_Fp2_mul_gamma_add(const RegExp& mz, const RegExp& mx, const RegExp& my)
2560 {
2561 const int a = 0;
2562 const int b = sizeof(Fp2);
2563 const int c = sizeof(Fp2) * 2;
2564 in_Fp2_mul_xi(mz + a, mx + c);
2565 in_Fp2_add(mz + a, mz + a, my + a);
2566 in_Fp2_add(mz + b, mx + a, my + b);
2567 in_Fp2_add(mz + c, mx + b, my + c);
2568 }
2569 // Fp12::square(Fp12& z);
// Emits the in-place Fp12 squaring routine. The stack frame holds two Fp6
// temporaries (t0, t1) plus one qword in which the incoming z pointer (gp1)
// is kept, because z.r_ (gt10) is reloaded after some of the calls below.
// NOTE(review): the step comments assume the p_Fp6_* routines take
// (dst, lhs, rhs) in (gp1, gp2, gp3) -- confirm against their generators.
2570 void make_Fp12_square()
2571 {
2572 // Fp6 t0, t1;
2573 const Ext6<Fp> t0(rsp);
2574 const Ext6<Fp> t1(rsp, t0.next);
2575 const int zsave = t1.next;
2576 const int SS = zsave + 8;
2577 const Ext12<Fp> z(gt10);
2578 MakeStackFrame<> sf(this, 10, SS / 8);
2579
// t0 = z.a_ + z.b_
2580 mov(z.r_, gp1);
2581 mov(ptr [rsp + zsave], gp1);
2582 lea(gp1, ptr [t0]);
2583 lea(gp2, ptr [z.a_]);
2584 lea(gp3, ptr [z.b_]);
2585 call(p_Fp6_add);
2586
// t1 = mul_gamma_add(z.b_, z.a_), see sub_Fp2_mul_gamma_add
2587 sub_Fp2_mul_gamma_add(t1, z.b_, z.a_);
2588
// z.b_ *= (value at the base of z), then (*z base) = t0 * t1
2589 lea(gp1, ptr [z.b_]);
2590 mov(gp2, gp1);
2591 mov(gp3, z.r_);
2592 call(p_Fp6_mul);
2593 mov(gp1, ptr [rsp + zsave]);
2594 lea(gp2, ptr [t0]);
2595 lea(gp3, ptr [t1]);
2596 call(p_Fp6_mul);
2597
// t1 = mul_gamma_add(z.b_, z.b_)
2598 mov(z.r_, ptr [rsp + zsave]);
2599 sub_Fp2_mul_gamma_add(t1, z.b_, z.b_);
2600
// (*z base) -= t1
2601 mov(z.r_, ptr [rsp + zsave]);
2602 mov(gp1, z.r_);
2603 mov(gp2, z.r_);
2604 lea(gp3, ptr [t1]);
2605 call(p_Fp6_sub);
2606
// z.b_ += z.b_
2607 lea(gp1, ptr [z.b_]);
2608 mov(gp2, gp1);
2609 mov(gp3, gp1);
2610 call(p_Fp6_add);
2611 }
2612 // Fp12::mul(Fp12& z, const Fp12& x, const Fp12& y);
// Emits the Fp12 multiplication routine. Following the step comments below:
//   T0   = x.a_ * y.a_
//   T1   = x.b_ * y.b_
//   zd.a_ = (x.a_ + x.b_) * (y.a_ + y.b_)
//   zd.b_ = zd.a_ - (T0 + T1)
//   zd.a_ = mul_gamma_add(T1, T0)
// and finally each of the 12 double-width components of zd is reduced into z.
// The three argument pointers are stored in the frame (zsave/xsave/ysave) and
// reloaded where needed. The movq(xm3..xm5, gp1..gp3) before each
// p_Fp6Dbl_mul call is presumably how that routine expects to find its
// arguments -- confirm against set_p_Fp6Dbl_mul.
2613 void make_Fp12_mul()
2614 {
2615 const Ext12<Fp> z(gt8);
2616 const Ext12<Fp> x(gt9);
2617 const Ext12<Fp> y(gt10);
2618
2619 const Ext6<Fp> t0(rsp);
2620 const Ext6<Fp> t1(rsp, t0.next);
2621 const Ext6<FpDbl> T0(rsp, t1.next);
2622 const Ext6<FpDbl> T1(rsp, T0.next);
2623 const Ext6<FpDbl> T2(rsp, T1.next);
2624 const Ext12<FpDbl> zd(rsp, T2.next);
2625 const Ext1<uint64> zsave(rsp, zd.next);
2626 const Ext1<uint64> xsave(rsp, zsave.next);
2627 const Ext1<uint64> ysave(rsp, xsave.next);
2628 const int SS = ysave.next;
2629 MakeStackFrame<> sf(this, 10, SS / 8);
2630 mov(ptr [zsave], gp1);
2631 mov(ptr [xsave], gp2);
2632 mov(ptr [ysave], gp3);
2633
2634 // Fp6Dbl::mul(T0, x.a_, y.a_); // QQQ
// gp2/gp3 still hold the incoming x and y pointers here
2635 lea(gp1, ptr [T0]);
2636 movq(xm3, gp1);
2637 movq(xm4, gp2);
2638 movq(xm5, gp3);
2639 call(p_Fp6Dbl_mul);
2640
2641// Fp6Dbl::mul(T1, x.b_, y.b_);
2642 mov(x.r_, ptr [xsave]);
2643 mov(y.r_, ptr [ysave]);
2644
2645 lea(gp1, ptr [T1]);
2646 lea(gp2, ptr [x.b_]);
2647 lea(gp3, ptr [y.b_]);
2648 movq(xm3, gp1);
2649 movq(xm4, gp2);
2650 movq(xm5, gp3);
2651 call(p_Fp6Dbl_mul);
2652
2653// Fp6::add(t0, x.a_, x.b_);
2654 mov(x.r_, ptr [xsave]);
2655 lea(gp1, ptr [t0]);
2656 lea(gp2, ptr [x.a_]);
2657 lea(gp3, ptr [x.b_]);
2658 call(p_Fp6_add);
2659
2660 // Fp6::add(t1, y.a_, y.b_);
2661 mov(y.r_, ptr [ysave]);
2662 lea(gp1, ptr [t1]);
2663 lea(gp2, ptr [y.a_]);
2664 lea(gp3, ptr [y.b_]);
2665 call(p_Fp6_add);
2666
2667 // Fp6Dbl::mul(zd.a_, t0, t1);
2668 lea(gp1, ptr [zd.a_]);
2669 lea(gp2, ptr [t0]);
2670 lea(gp3, ptr [t1]);
2671 movq(xm3, gp1);
2672 movq(xm4, gp2);
2673 movq(xm5, gp3);
2674 call(p_Fp6Dbl_mul);
2675
2676 // Fp6Dbl::add(T2, T0, T1);
2677 in_FpDbl_add(6, T2, T0, T1);
2678
2679 // Fp6Dbl::sub(zd.b_, zd.a_, T2);
2680 in_FpDbl_sub(6, zd.b_, zd.a_, T2);
2681
2682 // mul_gamma_add<Fp6Dbl, Fp2Dbl>(zd.a_, T1, T0);
2683 in_Fp2Dbl_mul_xi(zd.a_.a_, T1.c_);
2684 in_FpDbl_add(2, zd.a_.a_, zd.a_.a_, T0.a_);
2685 in_FpDbl_add(2, zd.a_.b_, T1.a_, T0.b_);
2686 in_FpDbl_add(2, zd.a_.c_, T1.b_, T0.c_);
2687
2688 // Dbl::mod(z, zd);
// one p_FpDbl_mod call per Fp component: dst = z + i * sizeof(Fp),
// src = zd + i * sizeof(FpDbl); the z pointer is reloaded every iteration
2689 for (int i = 0; i < 12; i++) {
2690 mov(gp1, ptr [zsave]);
2691 if (i > 0) add(gp1, sizeof(Fp) * i);
2692 lea(gp2, ptr [(RegExp)zd + sizeof(FpDbl) * i]);
2693 call(p_FpDbl_mod);
2694 }
2695 }
2696 // static void mul_Fp2_024C(Fp12T &z, const Fp6& x)
// Emits the routine that multiplies the Fp12 value z in place by the Fp6
// operand x (gp1 = z, gp2 = x). Each step below is annotated with the
// high-level operation it implements. Products are formed in double width
// with p_Fp2Dbl_mulOpt2, combined, and reduced into the matching component of
// z with in_Fp2Dbl_mod. ACC accumulates products that are subtracted in the
// last step. The z and x pointers are parked in xm3/xm4 and moved back into
// z.r_/x.r_ (gt9/gt10) before most uses.
2697 void make_Fp12Dbl_mul_Fp2_024()
2698 {
2699 // Fp2 t0, t1, t2, t4;
2700 // Fp2Dbl T2, T3;
2701 // Fp2Dbl X0T0, X2T2, X4T4;
2702 // Fp2Dbl ACC;
2703 const Ext2<Fp> t0(rsp);
2704 const Ext2<Fp> t1(rsp, t0.next);
2705 const Ext2<Fp> t2(rsp, t1.next);
2706 const Ext2<Fp> t4(rsp, t2.next);
2707 const Ext2<FpDbl> T2(rsp, t4.next);
2708 const Ext2<FpDbl> T3(rsp, T2.next);
2709 const Ext2<FpDbl> X0T0(rsp, T3.next);
2710 const Ext2<FpDbl> X2T2(rsp, X0T0.next);
2711 const Ext2<FpDbl> X4T4(rsp, X2T2.next);
2712 const Ext2<FpDbl> ACC(rsp, X4T4.next);
2713 const int SS = ACC.next;
2714 const Ext12<Fp> z(gt9);
2715 const Ext6<Fp> x(gt10);
2716
2717 MakeStackFrame<> sf(this, 10, SS / 8);
2718 const Xmm& zsave = xm3;
2719 const Xmm& xsave = xm4;
2720 mov(z.r_, gp1);
2721 mov(x.r_, gp2);
2722 movq(zsave, z.r_);
2723 movq(xsave, x.r_);
2724
2725 // Fp2Dbl::mulOpt2(X0T0, z.a_.a_, x.a_);
// z.a_.a_ and x.a_ are addressed through the base pointers themselves
2726 lea(gp1, ptr [X0T0]);
2727 mov(gp2, z.r_);
2728 mov(gp3, x.r_);
2729 call(p_Fp2Dbl_mulOpt2);
2730
2731 // Fp2Dbl::mulOpt2(X2T2, z.a_.c_, x.c_);
2732 movq(z.r_, zsave);
2733 movq(x.r_, xsave);
2734 lea(gp1, ptr [X2T2]);
2735 lea(gp2, ptr [z.a_.c_]);
2736 lea(gp3, ptr [x.c_]);
2737 call(p_Fp2Dbl_mulOpt2);
2738
2739 // Fp2Dbl::mulOpt2(X4T4, z.b_.b_, x.b_);
2740 movq(z.r_, zsave);
2741 movq(x.r_, xsave);
2742 lea(gp1, ptr [X4T4]);
2743 lea(gp2, ptr [z.b_.b_]);
2744 lea(gp3, ptr [x.b_]);
2745 call(p_Fp2Dbl_mulOpt2);
2746
2747 // Fp2::add(t2, z.a_.a_, z.b_.b_);
2748 movq(z.r_, zsave);
2749 in_Fp2_add(t2, z.a_.a_, z.b_.b_);
2750
2751 // Fp2::add(t1, z.a_.a_, z.a_.c_);
2752 in_Fp2_add(t1, z.a_.a_, z.a_.c_);
2753
2754 // Fp2::add(t4, z.a_.b_, z.b_.a_);
2755 in_Fp2_add(t4, z.a_.b_, z.b_.a_);
2756
2757 // t4 += z.b_.c_;
2758 in_Fp2_add(t4, t4, z.b_.c_);
2759
2760 // Fp2Dbl::mulOpt2(ACC, z.a_.b_, x.c_);
2761 movq(x.r_, xsave);
2762 lea(gp1, ptr [ACC]);
2763 lea(gp2, ptr [z.a_.b_]);
2764 lea(gp3, ptr [x.c_]);
2765 call(p_Fp2Dbl_mulOpt2);
2766
2767 // Fp2Dbl::add(T2, ACC, X4T4);
2768 in_Fp2Dbl_add(T2, ACC, X4T4);
2769
2770 //Fp2Dbl::mul_xi(T3, T2);
2771 in_Fp2Dbl_mul_xi(T3, T2);
2772
2773 // T3 += X0T0;
2774 in_Fp2Dbl_add(T3, T3, X0T0);
2775
2776 // Fp2Dbl::mod(z.a_.a_, T3);
2777 movq(z.r_, zsave);
2778 lea(gp1, ptr [z.a_.a_]);
2779 lea(gp2, ptr [T3]);
2780 in_Fp2Dbl_mod();
2781
2782 // Fp2Dbl::mulOpt2(T2, z.b_.c_, x.b_);
2783 movq(z.r_, zsave);
2784 movq(x.r_, xsave);
2785 lea(gp1, ptr [T2]);
2786 lea(gp2, ptr [z.b_.c_]);
2787 lea(gp3, ptr [x.b_]);
2788 call(p_Fp2Dbl_mulOpt2);
2789
2790 // ACC += T2;
2791 in_Fp2Dbl_add(ACC, ACC, T2);
2792
2793 // T2 += X2T2;
2794 in_Fp2Dbl_add(T2, T2, X2T2);
2795
2796 // Fp2Dbl::mul_xi(T3, T2);
2797 in_Fp2Dbl_mul_xi(T3, T2);
2798
2799 // Fp2Dbl::mulOpt2(T2, z.a_.b_, x.a_);
2800 movq(z.r_, zsave);
2801 movq(x.r_, xsave);
2802 lea(gp1, ptr [T2]);
2803 lea(gp2, ptr [z.a_.b_]);
2804 lea(gp3, ptr [x.a_]);
2805 call(p_Fp2Dbl_mulOpt2);
2806
2807 // ACC += T2;
2808 in_Fp2Dbl_add(ACC, ACC, T2);
2809
2810 // T3 += T2;
2811 in_Fp2Dbl_add(T3, T3, T2);
2812
2813 // Fp2Dbl::mod(z.a_.b_, T3);
2814 movq(z.r_, zsave);
2815 lea(gp1, ptr [z.a_.b_]);
2816 lea(gp2, ptr [T3]);
2817 in_Fp2Dbl_mod();
2818
2819 // Fp2::add(t0, x.a_, x.c_);
2820 movq(x.r_, xsave);
2821 in_Fp2_add(t0, x.a_, x.c_);
2822
2823 // Fp2Dbl::mulOpt2(T2, t1, t0);
2824 lea(gp1, ptr [T2]);
2825 lea(gp2, ptr [t1]);
2826 lea(gp3, ptr [t0]);
2827 call(p_Fp2Dbl_mulOpt2);
2828
2829 // T2 -= X0T0;
2830 in_Fp2Dbl_sub(T2, T2, X0T0);
2831
2832 // T2 -= X2T2;
2833 in_Fp2Dbl_sub(T2, T2, X2T2);
2834
2835 // Fp2Dbl::mulOpt2(T3, z.b_.a_, x.b_);
2836 movq(z.r_, zsave);
2837 movq(x.r_, xsave);
2838 lea(gp1, ptr [T3]);
2839 lea(gp2, ptr [z.b_.a_]);
2840 lea(gp3, ptr [x.b_]);
2841 call(p_Fp2Dbl_mulOpt2);
2842
2843 // ACC += T3;
2844 in_Fp2Dbl_add(ACC, ACC, T3);
2845
2846 // T2 += T3;
2847 in_Fp2Dbl_add(T2, T2, T3);
2848
2849 // Fp2::add(t0, z.a_.c_, z.b_.b_);
// must be taken before z.a_.c_ is overwritten by the reduction just below
2850 movq(z.r_, zsave);
2851 in_Fp2_add(t0, z.a_.c_, z.b_.b_);
2852
2853 // Fp2Dbl::mod(z.a_.c_, T2);
2854 lea(gp1, ptr [z.a_.c_]);
2855 lea(gp2, ptr [T2]);
2856 in_Fp2Dbl_mod();
2857
2858 movq(x.r_, xsave);
2859 // Fp2::add(t1, x.c_, x.b_);
2860 in_Fp2_add(t1, x.c_, x.b_);
2861
2862 // Fp2Dbl::mulOpt2(T2, t0, t1);
2863 lea(gp1, ptr [T2]);
2864 lea(gp2, ptr [t0]);
2865 lea(gp3, ptr [t1]);
2866 call(p_Fp2Dbl_mulOpt2);
2867
2868
2869 // T2 -= X2T2;
2870 in_Fp2Dbl_sub(T2, T2, X2T2);
2871
2872 // T2 -= X4T4;
2873 in_Fp2Dbl_sub(T2, T2, X4T4);
2874
2875 // Fp2Dbl::mul_xi(T3, T2);
2876 in_Fp2Dbl_mul_xi(T3, T2);
2877
2878 // Fp2Dbl::mulOpt2(T2, z.b_.a_, x.a_);
2879 movq(z.r_, zsave);
2880 movq(x.r_, xsave);
2881 lea(gp1, ptr [T2]);
2882 lea(gp2, ptr [z.b_.a_]);
2883 mov(gp3, x.r_);
2884 call(p_Fp2Dbl_mulOpt2);
2885
2886 // ACC += T2;
2887 in_Fp2Dbl_add(ACC, ACC, T2);
2888
2889 // T3 += T2;
2890 in_Fp2Dbl_add(T3, T3, T2);
2891// in_FpDbl_addNC(2, T3, T3, T2); // RRR?
2892
2893 // Fp2Dbl::mod(z.b_.a_, T3);
2894 movq(z.r_, zsave);
2895 lea(gp1, ptr [z.b_.a_]);
2896 lea(gp2, ptr [T3]);
2897 in_Fp2Dbl_mod();
2898
2899 // Fp2Dbl::mulOpt2(T2, z.b_.c_, x.c_);
2900 movq(z.r_, zsave);
2901 movq(x.r_, xsave);
2902 lea(gp1, ptr [T2]);
2903 lea(gp2, ptr [z.b_.c_]);
2904 lea(gp3, ptr [x.c_]);
2905 call(p_Fp2Dbl_mulOpt2);
2906
2907 // ACC += T2;
2908 in_Fp2Dbl_add(ACC, ACC, T2);
2909
2910 // Fp2Dbl::mul_xi(T3, T2);
2911 in_Fp2Dbl_mul_xi(T3, T2);
2912
2913 // Fp2::add(t0, x.a_, x.b_);
2914 movq(x.r_, xsave);
2915 in_Fp2_add(t0, x.a_, x.b_);
2916
2917 // Fp2Dbl::mulOpt2(T2, t2, t0);
2918 lea(gp1, ptr [T2]);
2919 lea(gp2, ptr [t2]);
2920 lea(gp3, ptr [t0]);
2921 call(p_Fp2Dbl_mulOpt2);
2922
2923 // T2 -= X0T0;
2924 in_Fp2Dbl_sub(T2, T2, X0T0);
2925
2926 // T2 -= X4T4;
2927 in_Fp2Dbl_sub(T2, T2, X4T4);
2928
2929 // T3 += T2;
2930 in_Fp2Dbl_add(T3, T3, T2);
2931
2932 // Fp2Dbl::mod(z.b_.b_, T3);
2933 movq(z.r_, zsave);
2934 lea(gp1, ptr [z.b_.b_]);
2935 lea(gp2, ptr [T3]);
2936 in_Fp2Dbl_mod();
2937
2938 // Fp2::add(t0, x.a_, x.c_);
2939 movq(x.r_, xsave);
2940 in_Fp2_add(t0, x.a_, x.c_);
2941
2942 // t0 += x.b_;
2943 in_Fp2_add(t0, t0, x.b_);
2944
2945 // Fp2Dbl::mulOpt2(T2, t4, t0);
2946 lea(gp1, ptr [T2]);
2947 lea(gp2, ptr [t4]);
2948 lea(gp3, ptr [t0]);
2949 call(p_Fp2Dbl_mulOpt2);
2950
2951 // T2 -= ACC;
2952 movq(z.r_, zsave);
2953 in_Fp2Dbl_sub(T2, T2, ACC);
2954
2955 // Fp2Dbl::mod(z.b_.c_, T2);
2956 lea(gp1, ptr [z.b_.c_]);
2957 lea(gp2, ptr [T2]);
2958 in_Fp2Dbl_mod();
2959 }
2960
2961 /*
2962 input (pz, px) = (gp1, gp2)
2963 use gt1, .., gt9
2964 */
// Generates the internal Fp2 -> Fp2Dbl squaring routine and records its entry
// address in p_Fp2Dbl_square. For x = (x.a_, x.b_) the emitted code produces
//   z.b_ = (2 * x.b_) * x.a_
//   z.a_ = (x.a_ + x.b_) * (x.a_ + p - x.b_)
// using two Fp scratch slots (t0, t1) that it reserves on the stack itself
// (sub/add rsp, SS) rather than through MakeStackFrame. gp3 is reused as an
// extra temporary under the local name gt0.
2965 void set_p_Fp2Dbl_square()
2966 {
2967 align(16);
2968 p_Fp2Dbl_square = (void*)const_cast<uint8_t*>(getCurr());
2969
2970 const Ext2<FpDbl> z(gp1);
2971 const Ext2<Fp> x(gp2);
2972 // Fp t0, t1
2973 const Ext1<Fp> t0(rsp);
2974 const Ext1<Fp> t1(rsp, t0.next);
2975 const int SS = t1.next;
2976
2977 const Reg64& gt0 = gp3;
2978 const Reg64& a = rax;
2979
2980 sub(rsp, SS);
2981
// t0 = x.b_ + x.b_
2982 load_rm(gt3, gt2, gt1, gt0, x.b_);
2983 add_rr(gt3, gt2, gt1, gt0, gt3, gt2, gt1, gt0);
2984 store_mr(t0, gt3, gt2, gt1, gt0);
2985
// z.b_ = t0 * x.a_
2986 mul4x4(z.b_, t0, x.a_, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1, gt0);
2987 // d0 = t[3..0] * a_
2988
// rax = address of s_pTbl[1]; the comments below treat that entry as p
2989 mov(a, (uint64_t)&s_pTbl[1]);
2990
2991 load_add_rm(gt3, gt2, gt1, gt0, x.a_, a, false); // t = a + p
2992 sub_rm(gt3, gt2, gt1, gt0, x.b_); // a + p - b
2993 store_mr(t1, gt3, gt2, gt1, gt0); // t = a + p - b
2994
2995 // Fp::addNC(t0, x.a_, x.b_);
2996 in_Fp_addNC(t0, x.a_, x.b_);
2997 // FpDbl::mul(z.a_, t0, t1);
2998 mul4x4(z, t0, t1, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1, gt0);
2999 add(rsp, SS);
3000 ret();
3001 }
3002 /*
3003 square(Dbl& z, const Fp2& x)
3004 z = x * x
3005 */
// Emits the externally callable wrapper: sets up a stack frame that makes
// gt1..gt9 usable (no extra stack qwords requested) and calls the internal
// routine recorded in p_Fp2Dbl_square, which must already have been generated.
3006 void make_Fp2Dbl_square()
3007 {
3008 MakeStackFrame<> sf(this, 9);
3009 call(p_Fp2Dbl_square);
3010 }
3011 // void pointDblLineEval(Fp6T<Fp2>& l, Fp2 *R, const typename Fp2::Fp *P);
// Emits the point-doubling / line-evaluation routine. Arguments arrive as
// (gp1, gp2, gp3) = (l, R, P) and are parked in xm3/xm4/xm5 so they can be
// reloaded into gt8/gt9/gt10 between calls. R[0..2] is updated in place and
// the three Fp2 slots of l receive the line coefficients.
// withoutP: when true, emission stops before the final multiplications of
// l.c_ by P[0] and l.b_ by P[1] (the routine then never reads P).
3012 void make_pointDblLineEval(bool withoutP)
3013 {
3014 // Fp2 t0, t1, t2, t3, t4, t5;
3015 // Fp2Dbl T0, T1, T2;
3016 const Ext2<Fp> t0(rsp);
3017 const Ext2<Fp> t1(rsp, t0.next);
3018 const Ext2<Fp> t2(rsp, t1.next);
3019 const Ext2<Fp> t3(rsp, t2.next);
3020 const Ext2<Fp> t4(rsp, t3.next);
3021 const Ext2<Fp> t5(rsp, t4.next);
3022 const Ext2<FpDbl> T0(rsp, t5.next);
3023 const Ext2<FpDbl> T1(rsp, T0.next);
3024 const Ext2<FpDbl> T2(rsp, T1.next);
3025 const int SS = T2.next;
3026 const Ext6<Fp> l(gt8);
3027 const Reg64& R = gt9;
3028 const Reg64& P = gt10;
3029
3030 MakeStackFrame<> sf(this, 10, SS / 8);
3031 const Xmm& lsave = xm3;
3032 const Xmm& Rsave = xm4;
3033 const Xmm& Psave = xm5;
3034 mov(l.r_, gp1);
3035 mov(R, gp2);
3036 movq(lsave, gp1);
3037 movq(Rsave, gp2);
3038 movq(Psave, gp3);
3039
3040 // Fp2::square(t0, R[2]);
3041 lea(gp1, ptr [t0]);
3042 lea(gp2, ptr [R + sizeof(Fp2) * 2]);
3043 call(p_Fp2_square);
3044
3045 // Fp2::mul(t4, R[0], R[1]);
3046 movq(R, Rsave);
3047 lea(gp1, ptr [t4]);
3048 lea(gp2, ptr [R + sizeof(Fp2) * 0]);
3049 lea(gp3, ptr [R + sizeof(Fp2) * 1]);
3050 call(p_Fp2_mul);
3051
3052 // Fp2::square(t1, R[1]);
3053 movq(R, Rsave);
3054 lea(gp1, ptr [t1]);
3055 lea(gp2, ptr [R + sizeof(Fp2) * 1]);
3056 call(p_Fp2_square);
3057
3058 // Fp2::add(t3, t0, t0);
3059 in_Fp2_add(t3, t0, t0);
3060
3061 // Fp2::divBy2(t4, t4);
3062 lea(gp1, ptr [t4]);
3063 mov(gp2, gp1);
3064 call(p_Fp2_divBy2);
3065
3066 // Fp2::add(t5, t0, t1);
3067 in_Fp2_add(t5, t0, t1);
3068
3069 // t0 += t3;
3070 in_Fp2_add(t0, t0, t3);
3071
// t2 = t0 times a curve-dependent constant; which code is emitted depends on
// BN_SUPPORT_SNARK and, in that build, on ParamT<Fp2>::b at generation time
3072#ifdef BN_SUPPORT_SNARK
3073 // (a + bu) * binv_xi
3074 if (ParamT<Fp2>::b == 82) {
3075 // (a + bu) * (9 - u) = (9a + b) + (9b - a)u
3076 in_Fp_mul_xi_addsub(t2, t0, t0 + 32, true);
3077 in_Fp_mul_xi_addsub(t2 + 32, t0 + 32, t0, false);
3078 } else {
3079 lea(gp1, ptr [t2]);
3080 lea(gp2, ptr [t0]);
3081 mov(gp3, (size_t)&ParamT<Fp2>::b_invxi);
3082 call(p_Fp2_mul);
3083 }
3084#else
3085 // Fp::add(t2.a_, t0.a_, t0.b_);
3086 in_Fp_add(t2.a_, t0.a_, t0.b_);
3087
3088 // Fp::sub(t2.b_, t0.b_, t0.a_);
3089 in_Fp_sub(t2.b_, t0.b_, t0.a_);
3090#endif
3091
3092 // Fp2::square(t0, R[0]);
3093 lea(gp1, ptr [t0]);
3094 movq(gp2, Rsave);
3095 call(p_Fp2_square);
3096
3097 // Fp2::add(t3, t2, t2);
3098 in_Fp2_add(t3, t2, t2);
3099
3100 // t3 += t2;
3101 in_Fp2_add(t3, t3, t2);
3102
3103 // Fp2::addNC(l.c_, t0, t0);
3104 movq(l.r_, lsave);
3105 in_Fp2_addNC(l.c_, t0, t0);
3106
3107 // Fp2::sub(R[0], t1, t3);
3108 movq(R, Rsave);
3109 in_Fp2_sub(R, t1, t3);
3110
3111 // Fp2::addNC(l.c_, l.c_, t0);
3112 in_Fp2_addNC(l.c_, l.c_, t0);
3113
3114 // t3 += t1;
3115 in_Fp2_add(t3, t3, t1);
3116
3117 // R[0] *= t4;
3118 mov(gp1, R);
3119 mov(gp2, gp1);
3120 lea(gp3, ptr [t4]);
3121 call(p_Fp2_mul);
3122
3123 // Fp2::divBy2(t3, t3);
3124 lea(gp1, ptr [t3]);
3125 mov(gp2, gp1);
3126 call(p_Fp2_divBy2);
3127
3128 // Fp2Dbl::square(T0, t3);
3129 lea(gp1, ptr [T0]);
3130 lea(gp2, ptr [t3]);
3131 call(p_Fp2Dbl_square);
3132
3133 // Fp2Dbl::square(T1, t2);
3134 lea(gp1, ptr [T1]);
3135 lea(gp2, ptr [t2]);
3136 call(p_Fp2Dbl_square);
3137
3138 // Fp2Dbl::addNC(T2, T1, T1);
3139 in_FpDbl_addNC(2, T2, T1, T1);
3140
3141 // Fp2::add(t3, R[1], R[2]);
3142 movq(R, Rsave);
3143 in_Fp2_add(t3, R + sizeof(Fp2) * 1, R + sizeof(Fp2) * 2);
3144
3145 // Fp2Dbl::addNC(T2, T2, T1);
// the SNARK build emits the reducing add here instead of addNC
3146#ifdef BN_SUPPORT_SNARK
3147 in_FpDbl_add(2, T2, T2, T1);
3148#else
3149 in_FpDbl_addNC(2, T2, T2, T1);
3150#endif
3151
3152 // Fp2::square(t3, t3);
3153 lea(gp1, ptr [t3]);
3154 mov(gp2, gp1);
3155 call(p_Fp2_square);
3156
3157 // t3 -= t5;
3158 in_Fp2_sub(t3, t3, t5);
3159
3160 // T0 -= T2;
3161 in_FpDbl_sub(2, T0, T0, T2);
3162
3163 // Fp2Dbl::mod(R[1], T0);
3164 movq(R, Rsave);
3165 lea(gp1, ptr [R + sizeof(Fp2) * 1]);
3166 lea(gp2, ptr [T0]);
3167 in_Fp2Dbl_mod();
3168
3169 // Fp2::mul(R[2], t1, t3);
3170 movq(R, Rsave);
3171 lea(gp1, ptr [R + sizeof(Fp2) * 2]);
3172 lea(gp2, ptr [t1]);
3173 lea(gp3, ptr [t3]);
3174 call(p_Fp2_mul);
3175
3176 // t2 -= t1;
3177 in_Fp2_sub(t2, t2, t1);
3178
3179 // Fp2::mul_xi(l.a_, t2);
3180 movq(l.r_, lsave);
3181 in_Fp2_mul_xi(l, t2);
3182
3183 // Fp2::neg(l.b_, t3);
3184 movq(l.r_, lsave);
3185 in_Fp2_neg(l.b_, t3);
// the "WithoutP" variant ends here; the stack frame object still emits its
// epilogue when it goes out of scope
3186 if (withoutP) return;
3187
3188 // Fp2::mul_Fp_0(l.c_, l.c_, P[0]);
// done as two p_Fp_mul calls, one per Fp component of l.c_
3189 lea(gp1, ptr [l.c_]);
3190 mov(gp2, gp1);
3191 movq(gp3, Psave);
3192 call(p_Fp_mul);
3193 movq(l.r_, lsave);
3194 lea(gp1, ptr [l.c_.b_]);
3195 mov(gp2, gp1);
3196 movq(gp3, Psave);
3197 call(p_Fp_mul);
3198
3199 // # 17
3200 // Fp2::mul_Fp_0(l.b_, l.b_, P[1]);
3201 movq(l.r_, lsave);
3202 movq(P, Psave);
3203 lea(gp1, ptr [l.b_]);
3204 mov(gp2, gp1);
3205 lea(gp3, ptr [P + sizeof(Fp) * 1]);
3206 call(p_Fp_mul);
3207 movq(l.r_, lsave);
3208 movq(P, Psave);
3209 lea(gp1, ptr [l.b_.b_]);
3210 mov(gp2, gp1);
3211 lea(gp3, ptr [P + sizeof(Fp) * 1]);
3212 call(p_Fp_mul);
3213
3214 }
3215
3216 PairingCode(size_t size, void *userPtr)
3217 : Xbyak::CodeGenerator(size, userPtr)
3218 , pp_(0)
3219 , gtn_(0)
3220 , a(rax)
3221 , d(rdx)
3222#ifdef _WIN32
3223 , gp1(rcx)
3224 , gp2(r9) // rdx => r9
3225 , gp3(r8)
3226 , gt1(r10)
3227 , gt2(r11)
3228 , gt3(rdi) // must be saved if used
3229 , gt4(rsi)
3230#else
3231 , gp1(rdi)
3232 , gp2(rsi)
3233 , gp3(r9) // rdx => r9
3234 , gt1(r8)
3235 , gt2(rcx)
3236 , gt3(r10)
3237 , gt4(r11)
3238#endif
3239 , gt5(r12) // must be saved if used
3240 , gt6(r13)
3241 , gt7(r14)
3242 , gt8(r15)
3243 , gt9(rbp)
3244 , gt10(rbx)
3245 {
3246 }
3247 /*
3248 utility function for many register
3249 you can use gt1, ..., gtn and rax, rdx
3250 gp0 : 1st parameter
3251 gp1 : 2nd parameter
3252 gp2 : 3rd parameter
3253 gtn : max gtn
3254 numQword : alloca stack if necessary
3255 rsp[0..8 * numQrod - 1] are available
3256 */
3257 int storeReg(int gtn, int numQword = 0)
3258 {
3259 const Reg64 tbl[] = {
3260 gt3, gt4, gt5, gt6, gt7, gt8, gt9, gt10
3261 };
3262 assert(0 <= gtn && gtn <= 10);
3263 gtn_ = gtn;
3264#ifdef _WIN32
3265 const int P = 8 * (std::max(0, gtn - 6) + numQword);
3266 if (P > 0) sub(rsp, P);
3267 for (int i = 3; i <= std::min(gtn, 6); i++) {
3268 mov(ptr [rsp + P + (i - 2) * 8], tbl[i - 3]);
3269 }
3270 for (int i = 7; i <= gtn; i++) {
3271 mov(ptr [rsp + P - 8 * (i - 6)], tbl[i - 3]);
3272 }
3273#else
3274 const int P = 8 * (std::max(0, gtn - 4) + numQword);
3275 if (P > 0) sub(rsp, P);
3276 for (int i = 5; i <= gtn; i++) {
3277 mov(ptr [rsp + P - 8 * (i - 4)], tbl[i - 3]);
3278 }
3279#endif
3280 mov(r9, rdx);
3281 return P;
3282 }
3283 /*
3284 specify P as the return value of storeReg
3285 */
3286 void restoreReg(int P)
3287 {
3288 const Reg64 tbl[] = {
3289 gt3, gt4, gt5, gt6, gt7, gt8, gt9, gt10
3290 };
3291 assert(0 <= gtn_ && gtn_ <= 10);
3292#ifdef _WIN32
3293 for (int i = 3; i <= std::min(gtn_, 6); i++) {
3294 mov(tbl[i - 3], ptr [rsp + P + (i - 2) * 8]);
3295 }
3296 for (int i = 7; i <= gtn_; i++) {
3297 mov(tbl[i - 3], ptr [rsp + P - 8 * (i - 6)]);
3298 }
3299#else
3300 for (int i = 5; i <= gtn_; i++) {
3301 mov(tbl[i - 3], ptr [rsp + P - 8 * (i - 4)]);
3302 }
3303#endif
3304 if (P > 0) add(rsp, P);
3305 }
// Generates all JIT routines for the modulus p.
//   p       : the prime modulus; used here to derive pp_
//   mode    : forwarded to detectCpu()
//   useMulx : forwarded to detectCpu()
// Steps: (1) detectCpu, (2) pp_ = lowest word of (-p)^-1 modulo 2^64,
// (3) the set_p_* generators emit the internal routines and record their
// addresses in the p_* members, (4) the make_* generators emit the public
// entry points, each starting on a 16-byte boundary.
// NOTE(review): this listing appears to have lost lines. The embedded line
// numbering skips one line before most make_* calls, and only the
// Fp::Dbl::mod binding survives. The missing lines presumably bind each
// public function pointer to getCurr() just before its generator runs, and
// declare the Z typedef used below -- confirm against the original file.
3306 void init(const mie::Vuint& p, int mode, bool useMulx)
3307 {
3308 detectCpu(mode, useMulx);
3309
3310 // make some parameters for mulmod and Fp_mul
3311 const size_t N = 64;
3313 Z::setModulo(mie::Vuint(1) << N);
3314 Z x(p);
3315 x = -x;
3316 x.inverse();
3317 pp_ = x[0];
3318
3319 // generate code
// internal routines, reached from other generated code through the p_*
// pointers, so they must exist before the make_* generators below run
3320 set_p_Fp_mul();
3321 set_p_Fp2_neg();
3322 set_p_Fp2_add();
3323 set_p_Fp2_sub();
3324 set_p_Fp2_addNC();
3325 set_p_FpDbl_mod();
3326 set_p_Fp2_square();
3327 set_p_Fp2_mul();
3328 set_p_Fp2_divBy2();
3329 set_p_Fp2_2z_add_3x();
3330 set_p_FpDbl_add();
3331 set_p_FpDbl_sub();
3332 set_p_FpDbl_addNC();
3333 set_p_FpDbl_subNC();
3334 set_p_Fp2Dbl_mul_xi();
3335 set_p_Fp2Dbl_mulOpt(1);
3336 set_p_Fp2Dbl_mulOpt(2);
3337 set_p_Fp2Dbl_square();
3338 set_p_Fp6_add();
3339 set_p_Fp6_sub();
3340 set_p_Fp6Dbl_mul();
3341 set_p_Fp6_mul();
3342
3343 // Fp
3344 typedef void (*opFpx2)(Fp&, const Fp&);
3345 typedef void (*opFpx3)(Fp&, const Fp&, const Fp&);
3346
3348 make_Fp_add(1);
3349
3350 align(16);
3352 make_Fp_sub(1);
3353
3354 align(16);
3356 make_Fp_addNC(1);
3357
3358 align(16);
3360 make_Fp_subNC();
3361
3362 align(16);
3364 make_Fp_neg();
3365
3366 align(16);
3368 make_Fp_shr(1);
3369
3370 align(16);
3372 make_Fp_shr(2);
3373 align(16);
3375 make_Fp_mul();
3376
3377 align(16);
3379 make_Fp_preInv();
3380
3381 // setup FpDbl
3382
3383 align(16);
3385 make_FpDbl_add(1);
3386
3387 align(16);
3389 make_FpDbl_addNC(1);
3390
3391 align(16);
3393 make_FpDbl_neg();
3394
3395 align(16);
3397 make_FpDbl_sub(1);
3398
3399 align(16);
3401 make_FpDbl_subNC(1);
3402
3403 align(16);
3405 make_FpDbl_mul();
3406
3407 align(16);
3408 Fp::Dbl::mod = getCurr<void (*)(Fp &, const FpDbl &)>(); // QQQ
3409 make_FpDbl_mod();
3410
3411 // setup Fp2
3412 typedef void (*opFp2x2)(Fp2&, const Fp2&);
3413 typedef void (*opFp2x3)(Fp2&, const Fp2&, const Fp2&);
3414
3415 align(16);
3417 make_Fp_add(2);
3418
3419 align(16);
3421 make_Fp_addNC(2);
3422
3423 align(16);
3425 make_Fp_sub(2);
3426
3427 align(16);
3429 make_Fp2_mul();
3430
3431 align(16);
3433 make_Fp2_mul_xi();
3434
3435 align(16);
3437 make_Fp2_square();
3438
3439 align(16);
3441 make_Fp2_mul_Fp_0();
3442
3443 align(16);
3445 make_Fp2_divBy2();
3446
3447 // setup Fp2::Dbl
3448
3449 align(16);
3451 make_FpDbl_add(2);
3452
3453 align(16);
3455 make_FpDbl_addNC(2);
3456
3457 align(16);
3459 make_Fp2Dbl_neg();
3460
3461 align(16);
3463 make_FpDbl_sub(2);
3464
3465 align(16);
3467 make_FpDbl_subNC(2);
3468
3469 align(16);
3471 make_Fp2Dbl_mulOpt(1);
3472
3473 align(16);
3475 make_Fp2Dbl_mulOpt(2);
3476
3477 align(16);
3479 make_Fp2Dbl_square();
3480
3481 align(16);
3483 make_Fp2Dbl_mod();
3484
3485 align(16);
3487 make_Fp2Dbl_mul_xi();
3488
3489 // setup Fp6
3490 typedef void (*opFp6x3)(Fp6&, const Fp6&, const Fp6&);
3491
3492 align(16);
3494 make_Fp6_add();
3495
3496 align(16);
3498 make_Fp6_sub();
3499
// two variants of the line evaluation: with and without the final
// multiplications by P
3500 align(16);
3502 make_pointDblLineEval(false);
3503 align(16);
3505 make_pointDblLineEval(true);
3506
3507 align(16);
3509 make_Fp6Dbl_mul();
3510
3511 align(16);
3513 make_Fp6_mul();
3514
3515 align(16);
3517 make_Compress_square_n();
3518
3519 align(16);
3521 make_Fp12_square();
3522
3523 align(16);
3525 make_Fp12_mul();
3526
3527 align(16);
3529 make_Fp12Dbl_mul_Fp2_024();
3530
3531// printf("jit code size=%d\n", (int)getSize());
3532 }
// ---- generator state ----
3533 bool isRaxP_; // true if rax is set to a pointer to p
3534 uint64_t pp_; // for Fp_mul
// Entry addresses of the internal routines; each is assigned by the matching
// set_p_* generator (called from init()) and used as a call target by other
// generated code.
3535 void *p_Fp_mul;
3536 void *p_Fp2_neg;
3537 void *p_Fp2_add;
3538 void *p_Fp2_sub;
3539 void *p_Fp2_square;
3540 void *p_Fp2_mul;
3541 void *p_Fp2_divBy2;
3542 void *p_Fp2_addNC;
3543 void *p_Fp2_2z_add_3x;
3544 void *p_FpDbl_add;
3545 void *p_FpDbl_sub;
3546 void *p_FpDbl_addNC;
3547 void *p_FpDbl_subNC;
3548 void *p_FpDbl_mod;
3549 void *p_Fp2Dbl_mulOpt1;
3550 void *p_Fp2Dbl_mulOpt2;
3551 void *p_Fp2Dbl_square;
3552 void *p_Fp2Dbl_mul_xi;
3553 void *p_Fp6_add;
3554 void *p_Fp6_sub;
3555 void *p_Fp6_mul;
3556 void *p_Fp6Dbl_mul;
// register count recorded by storeReg() and read back by restoreReg()
3557 int gtn_;
// Register aliases, bound once in the constructor:
//   a, d       : rax, rdx
//   gp1 .. gp3 : the three parameter registers
//   gt1 .. gt10: temporaries (which ones must be saved depends on the platform)
3558 const Reg64& a;
3559 const Reg64& d;
3560 const Reg64& gp1;
3561 const Reg64& gp2;
3562 const Reg64& gp3;
3563 const Reg64& gt1;
3564 const Reg64& gt2;
3565 const Reg64& gt3;
3566 const Reg64& gt4;
3567 const Reg64& gt5;
3568 const Reg64& gt6;
3569 const Reg64& gt7;
3570 const Reg64& gt8;
3571 const Reg64& gt9;
3572 const Reg64& gt10;
3573};
3574#endif // MIE_USE_X64ASM
3575
3576void Fp::setTablesForDiv(const mie::Vuint& p)
3577{
3578 // for divBy2
3579 assert((p[0] & 0x1) == 1);
3580 halfTbl_[0].clear();
3581 Fp::setDirect(halfTbl_[1], (p+1)>>1);
3582
3583 // for divBy4
3584 assert((p[0] & 0x3) == 3);
3585 quarterTbl_[0].clear();
3586 mie::Vuint quarter = (p+1)>>2;
3587 Fp::setDirect(quarterTbl_[1], quarter);
3588 Fp::setDirect(quarterTbl_[2], quarter*2);
3589 Fp::setDirect(quarterTbl_[3], quarter*3);
3590}
3591
// One-time global setup of the field for modulus p.
//   p                       : the modulus; must occupy exactly Fp::N units
//   mode, useMulx           : forwarded to PairingCode::init()
//   definedBN_SUPPORT_SNARK : whether the caller was built with
//                             BN_SUPPORT_SNARK; a mismatch with this file's
//                             setting prints a message and exits
// Only the first call does any work (guarded by a function-local static);
// later calls return immediately after the build-flag check.
// On success the function returns from inside the try block. If an
// exception is caught, the message is printed and the process exits with
// status 1.
// NOTE(review): this listing appears to have lost lines (the embedded
// numbering skips, e.g. before the ZN/Z uses, before `try`, and around the
// table pointer assignments) -- presumably typedefs, the halfTbl_/pNTbl_
// assignments and preprocessor guards. Confirm against the original file.
3592 void Fp::setModulo(const mie::Vuint& p, int mode, bool useMulx, bool definedBN_SUPPORT_SNARK)
3593 {
3594#ifdef DEBUG_COUNT
3595 puts("DEBUG_COUNT mode on!!!");
3596#endif
3597#ifdef BN_SUPPORT_SNARK
3598 const bool scipr = true;
3599#else
3600 const bool scipr = false;
3601#endif
3602 if (scipr != definedBN_SUPPORT_SNARK) {
3603 fprintf(stderr, "use -DBN_SUPPORT_SNARK for all sources\n");
3604 exit(1);
3605 }
3606 static bool init = false;
3607 if (init) return;
3608 init = true;
3609 if (p.size() != Fp::N) {
3610 mie::local::errExit("not support p for Fp::setModulo");
3611 }
3612 p_ = p;
3613
3614 // Fp_mul
// pp_mont = lowest unit of (-p)^-1 modulo 2^(bits per Unit);
// pN = p * 2^256; p_add1_div4_ = (p + 1) / 4
3615 {
3617 ZN::setModulo(Vuint(1) << (sizeof(Unit) * 8));
3618 ZN t(p);
3619 t = -t;
3620 t.inverse();
3621 pp_mont = t[0];
3622 pN = p << 256;
3623 p_add1_div4_ = (p + 1) / 4;
3624 }
3625
3626 // we can't use Fp before setting Fp_mul* variables!!!
// montgomeryR_ = 2^256 mod p; montgomeryR2_ = montgomeryR_^2 computed with
// Z's modulus set to p
3627 montgomeryR_ = (Vuint(1) << 256) % p;
3628 {
3630 Z::setModulo(p);
3631 Z t(montgomeryR_);
3632 Fp::setDirect(montgomeryR2_, t * t);
3633 }
3634 one_.clear();
3635 one_[0] = 1;
3636
3638 try {
3639 // setup code and data area
// one static buffer holds the code pages followed by one data page; the
// extra PageSize leaves room for aligning the start address
3640 const int PageSize = 4096;
3641 const size_t codeSize = PageSize * 9;
3642 const size_t dataSize = PageSize * 1;
3643
3644 static std::vector<Xbyak::uint8> buf;
3645 buf.resize(codeSize + dataSize + PageSize);
3646 Xbyak::uint8 *const codeAddr = Xbyak::CodeArray::getAlignedAddress(&buf[0], PageSize);
3647 Xbyak::CodeArray::protect(codeAddr, codeSize, true);
3648 s_data = Xbyak::CastTo<Data*>(codeAddr + codeSize);
3649
3650// printf("codeAddr=%p, dataAddr=%p\n", codeAddr, s_data);
// the check is kept but its body is commented out, so it has no effect
3651 if ((size_t)codeAddr & 0xffffffff00000000ULL || (size_t)s_data & 0xffffffff00000000ULL) {
3652 // printf("\naddress of code and data is over 4GB!!!\n");
3653 }
3654
3655 // setup data
3656 s_pTbl = s_data->pTbl;
3658 Fp::quarterTbl_ = s_data->quarterTbl;
3660
// s_pTbl[i] = i * p
3661 for (size_t i = 0; i < pTblSize; i++) {
3662 Fp::setDirect(s_pTbl[i], p * int(i));
3663 }
3664 /*
3665 for option1
3666 lower 192-bits of pNTbl_[1] = 0
3667 lower 192-bits of pNTbl_[2] = 0
3668 */
// pNTbl_[h] = pN >> h
3669 Fp::Dbl::pNTbl_[0].setDirect(pN);
3670 for (size_t h = 1; h < pNtblSize; ++h) {
3671 Fp::Dbl::pNTbl_[h].setDirect(pN >> h);
3672 }
3673 setTablesForDiv(p);
3674
3675 // setup code
3676 static PairingCode code(codeSize, codeAddr);
3677 code.init(p_, mode, useMulx);
// from here on Fp arithmetic is used: invTbl_[511 - i] = 2^(i + 1) as an Fp
3678 {
3679 Fp t(2);
3680 for (int i = 0; i < 512; i++) {
3681 invTbl_[511 - i] = t;
3682 t += t;
3683 }
3684 }
3685 return;
3686 } catch (std::exception& e) {
3687 fprintf(stderr, "setModulo ERR:%s\n", e.what());
3688 }
3689 ::exit(1);
3690 }
3691
std::string one()
const mie::Vuint & p
Definition bn.cpp:27
const mie::Vuint & r
Definition bn.cpp:28
Xbyak::util::Clock sclk
BN parameter.
uint64_t debug_buf[128]
int g_count_m256
int g_count_add256
int g_count_r512
static uint8 * getAlignedAddress(uint8 *addr, size_t alignedSize=16)
Definition xbyak.h:987
static bool protect(const void *addr, size_t size, bool canExec)
Definition xbyak.h:966
const uint8 * getCurr() const
Definition xbyak.h:908
void shr(const Operand &op, const Reg8 &_cl)
Definition xbyak.h:730
void shl(const Operand &op, const Reg8 &_cl)
Definition xbyak.h:725
void call(const Operand &op)
Definition xbyak.h:2150
const Xmm & xm1
Definition xbyak.h:2084
const Xmm & xm5
Definition xbyak.h:2084
void and_(const Operand &op, uint32 imm)
Definition xbyak.h:21
const Xmm & xm3
Definition xbyak.h:2084
void xor_(const Operand &op, uint32 imm)
Definition xbyak.h:1279
void paddd(const Mmx &mmx, const Operand &op)
Definition xbyak.h:518
const Xmm & xm2
Definition xbyak.h:2084
void shrd(const Operand &op, const Reg &reg, const Reg8 &_cl)
Definition xbyak.h:732
void align(size_t x=16, bool useMultiByteNop=true)
Definition xbyak.h:2475
const Xmm & xm0
Definition xbyak.h:2084
void jmp(const Operand &op)
Definition xbyak.h:2144
void inc(const Operand &op)
Definition xbyak.h:307
void mulx(const Reg32e &r1, const Reg32e &r2, const Operand &op)
Definition xbyak.h:502
void test(const Operand &op, const Reg &reg)
Definition xbyak.h:2162
void add(const Operand &op, uint32 imm)
Definition xbyak.h:6
void ret(int imm=0)
Definition xbyak.h:667
void sub(const Operand &op, uint32 imm)
Definition xbyak.h:746
void movq(const Address &addr, const Mmx &mmx)
Definition xbyak.h:478
void cmovc(const Reg &reg, const Operand &op)
Definition xbyak.h:68
void shld(const Operand &op, const Reg &reg, const Reg8 &_cl)
Definition xbyak.h:727
void adc(const Operand &op, uint32 imm)
Definition xbyak.h:3
void jz(const Label &label, LabelType type=T_AUTO)
Definition xbyak.h:425
const Xmm & xm4
Definition xbyak.h:2084
void jnc(const Label &label, LabelType type=T_AUTO)
Definition xbyak.h:365
void pxor(const Mmx &mmx, const Operand &op)
Definition xbyak.h:653
void jc(const Label &label, LabelType type=T_AUTO)
Definition xbyak.h:325
void mov(const Operand &reg1, const Operand &reg2)
Definition xbyak.h:2210
void or_(const Operand &op, uint32 imm)
Definition xbyak.h:506
void pop(const Operand &op)
Definition xbyak.h:2190
void mul(const Operand &op)
Definition xbyak.h:497
void lea(const Reg &reg, const Address &addr)
Definition xbyak.h:432
void jnz(const Label &label, LabelType type=T_AUTO)
Definition xbyak.h:401
void L(const std::string &label)
Definition xbyak.h:2126
void push(const Operand &op)
Definition xbyak.h:2189
void sbb(const Operand &op, uint32 imm)
Definition xbyak.h:685
static void getCpuid(unsigned int eaxIn, unsigned int data[4])
Definition xbyak_util.h:172
bool has(Type type) const
Definition xbyak_util.h:368
static const Type tBMI2
Definition xbyak_util.h:226
static const Type tINTEL
Definition xbyak_util.h:229
Definition zm2.h:18
static void(* add)(Fp &out, const Fp &x, const Fp &y)
Definition zm2.h:83
static void(* addNC)(Fp &out, const Fp &x, const Fp &y)
Definition zm2.h:86
static void(* shr2)(Fp &out, const Fp &x)
Definition zm2.h:89
static void(* mul)(Fp &out, const Fp &x, const Fp &y)
Definition zm2.h:93
static int(* preInv)(Fp &r, const Fp &x)
Definition zm2.h:94
static mie::Fp * halfTbl_
Definition zm2.h:270
static const mie::Vuint & getModulo()
Definition zm2.h:262
@ N
Definition zm2.h:26
static const Fp & getDirectP(int n)
Definition zm2.cpp:126
static void(* neg)(Fp &out, const Fp &x)
Definition zm2.h:92
static void(* sub)(Fp &out, const Fp &x, const Fp &y)
Definition zm2.h:91
static void(* subNC)(Fp &out, const Fp &x, const Fp &y)
Definition zm2.h:87
static void(* shr1)(Fp &out, const Fp &x)
Definition zm2.h:88
static MIE_FORCE_INLINE void setDirect(Fp &out, const T &in)
Definition zm2.h:238
static void setModulo(const mie::Vuint &p, int mode, bool useMulx=true, bool definedBN_SUPPORT_SNARK=false)
Definition zm2.cpp:3592
static void setModulo(const V &m)
Definition zm.h:1331
const struct Ptn tbl[]
#define d1
#define P
Definition dtoa.c:437
#define d0
int * count
void init()
Definition lib_test.cpp:3
LOGGING_API void printf(Category category, const char *format,...)
Definition Logging.cpp:30
Definition xbyak.h:104
const To CastTo(From p)
Definition xbyak.h:279
Definition bn.h:56
Fp6::Dbl Fp6Dbl
Definition bn.h:2958
Fp2T< Fp > Fp2
Definition bn.h:2954
uint64_t y
Definition sha3.cpp:34
uint32_t Unit
Definition zm.h:66
uint32_t next(octet_iterator &it, octet_iterator end)
Definition checked.h:137
unsigned char uint8
const GenericPointer< typename T::ValueType > T2 T::AllocatorType & a
Definition pointer.h:1181
Xbyak::uint64 uint64
Definition quantize.cpp:51
const int N
Definition quantize.cpp:54
unsigned char uint8_t
Definition stdint.h:124
unsigned __int64 uint64_t
Definition stdint.h:136
Definition bench.cpp:18
Definition zm2.cpp:48
Fp halfTbl[2]
Definition zm2.cpp:50
Fp quarterTbl[4]
Definition zm2.cpp:51
Fp pTbl[pTblSize]
Definition zm2.cpp:49
FpDbl pNTbl[pNtblSize]
Definition zm2.cpp:52
static void(* square_n)(CompressT &z, int n)
Definition bn.h:2338
static void(* mul_Fp2_024)(Fp12T &z, const Fp6 &x)
Definition bn.h:1963
static void(* mul)(Fp12T &z, const Fp12T &x, const Fp12T &y)
Definition bn.h:1449
static void(* square)(Fp12T &z)
Definition bn.h:1475
static bin_op * sub
Definition bn.h:654
static void(* mod)(Fp2T &z, const Dbl &x)
Definition bn.h:660
static bin_op * subNC
Definition bn.h:655
static bin_op * addNC
Definition bn.h:652
static uni_op * mul_xi
Definition bn.h:662
static bin_op * add
Definition bn.h:651
static void(* square)(Dbl &z, const Fp2T &x)
Definition bn.h:659
static void(* mulOpt1)(Dbl &z, const Fp2T &x, const Fp2T &y)
Definition bn.h:657
static void(* mulOpt2)(Dbl &z, const Fp2T &x, const Fp2T &y)
Definition bn.h:658
static uni_op * neg
Definition bn.h:653
static void(* square)(Fp2T &z, const Fp2T &x)
Definition bn.h:373
static void(* addNC)(Fp2T &z, const Fp2T &x, const Fp2T &y)
Definition bn.h:369
static void(* divBy2)(Fp2T &z, const Fp2T &x)
Definition bn.h:376
static void(* mul_Fp_0)(Fp2T &z, const Fp2T &x, const Fp &b)
Definition bn.h:375
static void(* sub)(Fp2T &z, const Fp2T &x, const Fp2T &y)
Definition bn.h:370
static void(* mul_xi)(Fp2T &z, const Fp2T &x)
Definition bn.h:374
static void(* add)(Fp2T &z, const Fp2T &x, const Fp2T &y)
Definition bn.h:368
static void(* mul)(Fp2T &z, const Fp2T &x, const Fp2T &y)
Definition bn.h:372
static void(* mul)(Dbl &, const Fp6T &x, const Fp6T &y)
Definition bn.h:1262
Definition bn.h:837
static void(* pointDblLineEvalWithoutP)(Fp6T &l, Fp2 *R)
Definition bn.h:987
static void(* pointDblLineEval)(Fp6T &l, Fp2 *R, const Fp *P)
Definition bn.h:986
static void(* sub)(Fp6T &z, const Fp6T &x, const Fp6T &y)
Definition bn.h:983
static void(* add)(Fp6T &z, const Fp6T &x, const Fp6T &y)
Definition bn.h:982
static void(* mul)(Fp6T &z, const Fp6T &x, const Fp6T &y)
Definition bn.h:984
static MIE_FORCE_INLINE void setDirect(Dbl &out, const mie::Vuint &in)
Definition zm2.h:312
const Unit * const_ptr() const
Definition zm2.h:338
static uni_op * neg
Definition zm2.h:374
static void(* mod)(Fp &z, const Dbl &x)
Definition zm2.h:398
void bin_op(Dbl &z, const Dbl &x, const Dbl &y)
Definition zm2.h:366
static bin_op * sub
Definition zm2.h:379
static bin_op * subNC
Definition zm2.h:380
static Dbl * pNTbl_
Definition zm2.h:403
static bin_op * addNC
Definition zm2.h:372
static void(* mul)(Dbl &z, const Fp &x, const Fp &y)
Definition zm2.h:393
void uni_op(Dbl &z, const Dbl &x)
Definition zm2.h:365
static bin_op * add
Definition zm2.h:371
size_t size() const
Definition zm.h:519
void clear()
Definition zm.h:406
#define R
Xbyak ; JIT assembler for x86(IA32)/x64 by C++.
yh_object_type type
Definition yubihsm.h:672
CK_ULONG d
CK_RV ret
char * s
uint8_t buf[2048]
int l
c_gkp_out sizeof(template))
mie::ZmZ< mie::Vuint, Fp > Fp_emu
Definition zm2.cpp:61
const size_t pNtblSize
Definition zm2.cpp:47
const size_t pTblSize
Definition zm2.cpp:46
Data * s_data
Definition zm2.cpp:54