7#define XBYAK_NO_OP_NAMES
13#if defined(_MSC_VER) && (_MSC_VER <= 1500)
63static inline void Fp_addC(
Fp& out,
const Fp& x,
const Fp& y)
74static inline void Fp_addNC_C(
Fp& out,
const Fp& x,
const Fp& y)
81static inline void Fp_subNC_C(
Fp& out,
const Fp& x,
const Fp& y)
88static inline void Fp_subC(
Fp& out,
const Fp& x,
const Fp& y)
100static inline void Fp_mulC(
Fp& out,
const Fp& x,
const Fp& y)
110static void Fp_negC(
Fp& out,
const Fp& x)
112 static const Fp zero(0);
128 assert(0 <= n && (
size_t)n <
pTblSize);
155static void FpDbl_negC(
FpDbl &z,
const FpDbl &x)
186static void FpDbl_mulC(
FpDbl &z,
const Fp &x,
const Fp &y)
194static void FpDbl_modC(
Fp& out,
const FpDbl& x)
196 const size_t UnitLen =
sizeof(
mie::Unit) * 8;
200 const size_t n = 256 / UnitLen;
201 for (
size_t i = 0; i < n; i++) {
223using namespace Xbyak;
236 unsigned int data[4];
238 stepping = data[0] & mask(4);
239 model = (data[0] >> 4) & mask(4);
240 family = (data[0] >> 8) & mask(4);
241 type = (data[0] >> 12) & mask(2);
242 extModel = (data[0] >> 16) & mask(4);
243 extFamily = (data[0] >> 20) & mask(8);
244 if (family == 0x0f) {
245 displayFamily = family + extFamily;
247 displayFamily = family;
249 if (family == 6 || family == 0x0f) {
250 displayModel = (extModel << 4) + model;
252 displayModel = model;
// Returns an unsigned value whose low n bits are set (e.g. n = 4 -> 0xF).
// n must be smaller than the bit width of unsigned int: shifting 1U by the
// full width or more is undefined behaviour. The callers visible in this
// excerpt only pass 2, 4 and 8.
255 unsigned int mask(
int n)
const
257 return (1U << n) - 1;
276bool interleaveLoad =
false;
277bool g_useMulx =
false;
279void detectCpu(
int mode,
bool useMulx)
290 interleaveLoad =
false;
293 interleaveLoad =
true;
296 interleaveLoad =
true;
297 if (!isIntel || (ext.family == 6 && ext.displayModel == 0x2a)) {
298 interleaveLoad =
false;
318struct PutDebugCounter {
321 if (debug_counter)
printf(
"debug_counter=%d\n", debug_counter);
326template<
class Code = PairingCode>
327struct MakeStackFrame {
330 MakeStackFrame(
Code *code,
int gtn,
int numQword = 0)
332 , P_(code_->storeReg(gtn, numQword))
334 code_->isRaxP_ =
false;
338 code_->restoreReg(P_);
348 Ext1(
const Reg64&
r,
int n = 0)
354 operator RegExp()
const {
return r_ + n_; }
360 void operator=(
const Ext1&);
365 Ext2(
const Reg64&
r,
int n = 0)
373 operator RegExp()
const {
return r_ + n_; }
381 void operator=(
const Ext2&);
386 Ext6(
const Reg64&
r,
int n = 0)
395 operator RegExp()
const {
return r_ + n_; }
404 void operator=(
const Ext6&);
409 Ext12(
const Reg64&
r,
int n = 0)
417 operator RegExp()
const {
return r_ + n_; }
425 void operator=(
const Ext12&);
432 void load_rm(
const Reg64& z3,
const Reg64& z2,
const Reg64& z1,
const Reg64& z0,
435 mov(z0, ptr [m + 8 * 0]);
436 mov(z1, ptr [m + 8 * 1]);
437 mov(z2, ptr [m + 8 * 2]);
438 mov(z3, ptr [m + 8 * 3]);
// Emits four 64-bit stores that write the register group x3:x2:x1:x0 to the
// 32 bytes starting at address m. x0 goes to the lowest address (offset 0)
// and x3 to the highest (offset 24). Only mov instructions are emitted.
443 void store_mr(
const RegExp& m,
const Reg64& x3,
const Reg64& x2,
const Reg64& x1,
const Reg64& x0)
445 mov(ptr [m + 8 * 0], x0);
446 mov(ptr [m + 8 * 1], x1);
447 mov(ptr [m + 8 * 2], x2);
448 mov(ptr [m + 8 * 3], x3);
453 void add_rr(
const Reg64& z3,
const Reg64& z2,
const Reg64& z1,
const Reg64& z0,
454 const Reg64& x3,
const Reg64& x2,
const Reg64& x1,
const Reg64& x0)
464 void add_rm(
const Reg64& z3,
const Reg64& z2,
const Reg64& z1,
const Reg64& z0,
467 add(z0, ptr [m + 8 * 0]);
468 adc(z1, ptr [m + 8 * 1]);
469 adc(z2, ptr [m + 8 * 2]);
470 adc(z3, ptr [m + 8 * 3]);
473 void upCount(
int *
count)
484 void adc_rm(
const Reg64& z3,
const Reg64& z2,
const Reg64& z1,
const Reg64& z0,
487 adc(z0, ptr [m + 8 * 0]);
488 adc(z1, ptr [m + 8 * 1]);
489 adc(z2, ptr [m + 8 * 2]);
490 adc(z3, ptr [m + 8 * 3]);
492 void load_add_rm(
const Reg64& z3,
const Reg64& z2,
const Reg64& z1,
const Reg64& z0,
498 if (interleaveLoad) {
499 mov(z0, ptr [mx + 8 * 0]);
501 adc(z0, ptr [my + 8 * 0]);
503 add(z0, ptr [my + 8 * 0]);
505 mov(z1, ptr [mx + 8 * 1]);
506 adc(z1, ptr [my + 8 * 1]);
507 mov(z2, ptr [mx + 8 * 2]);
508 adc(z2, ptr [my + 8 * 2]);
509 mov(z3, ptr [mx + 8 * 3]);
510 adc(z3, ptr [my + 8 * 3]);
512 load_rm(z3, z2, z1, z0, mx);
514 adc_rm(z3, z2, z1, z0, my);
516 add_rm(z3, z2, z1, z0, my);
520 void load_sub_rm(
const Reg64& z3,
const Reg64& z2,
const Reg64& z1,
const Reg64& z0,
526 if (interleaveLoad) {
527 mov(z0, ptr [mx + 8 * 0]);
529 sbb(z0, ptr [my + 8 * 0]);
531 sub(z0, ptr [my + 8 * 0]);
533 mov(z1, ptr [mx + 8 * 1]);
534 sbb(z1, ptr [my + 8 * 1]);
535 mov(z2, ptr [mx + 8 * 2]);
536 sbb(z2, ptr [my + 8 * 2]);
537 mov(z3, ptr [mx + 8 * 3]);
538 sbb(z3, ptr [my + 8 * 3]);
540 load_rm(z3, z2, z1, z0, mx);
542 sbb_rm(z3, z2, z1, z0, my);
544 sub_rm(z3, z2, z1, z0, my);
551 void sub_rr(
const Reg64& z3,
const Reg64& z2,
const Reg64& z1,
const Reg64& z0,
552 const Reg64& x3,
const Reg64& x2,
const Reg64& x1,
const Reg64& x0)
562 void sub_rm(
const Reg64& z3,
const Reg64& z2,
const Reg64& z1,
const Reg64& z0,
565 sub(z0, ptr [m + 8 * 0]);
566 sbb(z1, ptr [m + 8 * 1]);
567 sbb(z2, ptr [m + 8 * 2]);
568 sbb(z3, ptr [m + 8 * 3]);
573 void sbb_rm(
const Reg64& z3,
const Reg64& z2,
const Reg64& z1,
const Reg64& z0,
576 sbb(z0, ptr [m + 8 * 0]);
577 sbb(z1, ptr [m + 8 * 1]);
578 sbb(z2, ptr [m + 8 * 2]);
579 sbb(z3, ptr [m + 8 * 3]);
581 void in_Fp_add_carry(
const RegExp& mz,
const RegExp& mx,
const RegExp& my,
bool withCarry)
583 if (interleaveLoad) {
584 mov(gt1, ptr [mx + 8 * 0]);
586 adc(gt1, ptr [my + 8 * 0]);
588 add(gt1, ptr [my + 8 * 0]);
590 mov(ptr [mz + 8 * 0], gt1);
592 mov(gt2, ptr [mx + 8 * 1]);
593 adc(gt2, ptr [my + 8 * 1]);
594 mov(ptr [mz + 8 * 1], gt2);
596 mov(gt3, ptr [mx + 8 * 2]);
597 adc(gt3, ptr [my + 8 * 2]);
598 mov(ptr [mz + 8 * 2], gt3);
600 mov(gt4, ptr [mx + 8 * 3]);
601 adc(gt4, ptr [my + 8 * 3]);
602 mov(ptr [mz + 8 * 3], gt4);
604 load_add_rm(gt4, gt3, gt2, gt1, mx, my, withCarry);
605 store_mr(mz, gt4, gt3, gt2, gt1);
608 void in_Fp_sub_carry(
const RegExp& mz,
const RegExp& mx,
const RegExp& my,
bool withCarry)
610 if (interleaveLoad) {
611 mov(gt1, ptr [mx + 8 * 0]);
613 sbb(gt1, ptr [my + 8 * 0]);
615 sub(gt1, ptr [my + 8 * 0]);
617 mov(ptr [mz + 8 * 0], gt1);
619 mov(gt2, ptr [mx + 8 * 1]);
620 sbb(gt2, ptr [my + 8 * 1]);
621 mov(ptr [mz + 8 * 1], gt2);
623 mov(gt3, ptr [mx + 8 * 2]);
624 sbb(gt3, ptr [my + 8 * 2]);
625 mov(ptr [mz + 8 * 2], gt3);
627 mov(gt4, ptr [mx + 8 * 3]);
628 sbb(gt4, ptr [my + 8 * 3]);
629 mov(ptr [mz + 8 * 3], gt4);
631 load_sub_rm(gt4, gt3, gt2, gt1, mx, my, withCarry);
632 store_mr(mz, gt4, gt3, gt2, gt1);
637 in_Fp_add_carry(mz, mx, my,
false);
641 in_Fp_sub_carry(mz, mx, my,
false);
645 in_Fp_add_carry(mz, mx, my,
true);
649 in_Fp_sub_carry(mz, mx, my,
true);
666 }
else if (my == mx) {
674 smart_set_gp(mz, mx, my);
679 smart_set_gp(mz, mx, my);
682 void make_Fp_addNC(
int n)
685 const Reg64& z = rcx;
686 const Reg64& x = rdx;
689 const Reg64& z = rdi;
690 const Reg64& x = rsi;
691 const Reg64&
y = rdx;
693 const Reg64& z2 = r9;
694 const Reg64& z1 = r10;
695 const Reg64& z0 = r11;
696 for (
int i = 0; i < n; i++) {
697 load_add_rm(rax, z2, z1, z0, x + 32 * i, y + 32 * i,
false);
698 store_mr(z + 32 * i, rax, z2, z1, z0);
705 const Reg64& z = rcx;
706 const Reg64& x = rdx;
709 const Reg64& z = rdi;
710 const Reg64& x = rsi;
711 const Reg64&
y = rdx;
713 const Reg64& z2 = r9;
714 const Reg64& z1 = r10;
715 const Reg64& z0 = r11;
716 load_sub_rm(x, z2, z1, z0, x, y,
false);
717 store_mr(z, x, z2, z1, z0);
726 void in_Fp_add_modp()
733 sub_rm(gt4, gt3, gt2, gt1, rax);
754 load_add_rm(gt4, gt3, gt2, gt1, mx, my,
false);
757 store_mr(mz, gt4, gt3, gt2, gt1);
764 void in_Fp_sub_modp()
772 and_(rdx, qword [rax + 8 * 0]);
773 and_(gt5, qword [rax + 8 * 1]);
774 and_(gt6, qword [rax + 8 * 2]);
775 and_(gt7, qword [rax + 8 * 3]);
782 cmovc(rdx, qword [rax + 8 * 0]);
783 cmovc(gt5, qword [rax + 8 * 1]);
784 cmovc(gt6, qword [rax + 8 * 2]);
785 cmovc(gt7, qword [rax + 8 * 3]);
787 add_rr(gt4, gt3, gt2, gt1, gt7, gt6, gt5, rdx);
790 add_rm(gt4, gt3, gt2, gt1, rax);
800 load_sub_rm(gt4, gt3, gt2, gt1, mx, my,
false);
802 store_mr(mz, gt4, gt3, gt2, gt1);
810 in_Fp_addNC(mz, mx, my);
811 load_add_rm(gt4, gt3, gt2, gt1, mx +
sizeof(
Fp), my +
sizeof(
Fp),
true);
813 store_mr(mz + 32, gt4, gt3, gt2, gt1);
821 in_Fp_subNC(mz, mx, my);
822 load_sub_rm(gt4, gt3, gt2, gt1, mx +
sizeof(
Fp), my +
sizeof(
Fp),
true);
824 store_mr(mz + 32, gt4, gt3, gt2, gt1);
828 smart_set_gp(mz, mx, my);
831 void set_p_FpDbl_add()
835 in_FpDbl_add(gp1, gp2, gp3);
838 void set_p_FpDbl_addNC()
842 in_Fp_addNC(gp1, gp2, gp3);
843 in_Fp_adcNC(gp1 + 32, gp2 + 32, gp3 + 32);
846 void set_p_FpDbl_subNC()
850 in_Fp_subNC(gp1, gp2, gp3);
851 in_Fp_sbbNC(gp1 + 32, gp2 + 32, gp3 + 32);
856 smart_set_gp(mz, mx, my);
863 void set_p_FpDbl_sub()
867 sub_FpDbl_sub(gp1, gp2, gp3);
872 smart_set_gp(mz, mx, my);
882 for (
int i = 0; i < n; i++) {
883 in_Fp_add(mz + 32 * i, mx + 32 * i, my + 32 * i);
889 load_rm(gt4, gt3, gt2, gt1, mx);
895 load_sub_rm(gt4, gt3, gt2, gt1, rax, mx,
false);
897 store_mr(mz, gt4, gt3, gt2, gt1);
899 void in_Fp_neg(
int n,
const RegExp& mz,
const RegExp& mx)
902 for (
int i = 0; i < n; i++) {
903 in_Fp_neg(mz + 32 * i, mx + 32 * i);
922 in_Fp_neg(2, gp1, gp2);
928 smart_set_gp(mz, mx, my);
933 smart_set_gp(mz, mx, my);
936 void set_p_Fp2_addNC()
940 in_Fp_addNC(gp1, gp2, gp3);
941 in_Fp_addNC(gp1 + 32, gp2 + 32, gp3 + 32);
948 in_Fp_add(2, gp1, gp2, gp3);
955 in_Fp_sub(2, gp1, gp2, gp3);
960 smart_set_gp(mz, mx, my);
966 for (
int i = 0; i < n; i++) {
967 in_Fp_sub(mz + 32 * i, mx + 32 * i, my + 32 * i);
972 for (
int i = 0; i < n; i++) {
973 in_FpDbl_add(mz + 64 * i, mx + 64 * i, my + 64 * i);
978 for (
int i = 0; i < n; i++) {
979 in_FpDbl_addNC(mz + 64 * i, mx + 64 * i, my + 64 * i);
984 for (
int i = 0; i < n; i++) {
985 sub_FpDbl_sub(mz + 64 * i, mx + 64 * i, my + 64 * i);
994 void make_Fp_add(
int n)
996 MakeStackFrame<> sf(
this, 7);
997 in_Fp_add(n, gp1, gp2, gp3);
999 void make_Fp_sub(
int n)
1001 MakeStackFrame<> sf(
this, 7);
1002 in_Fp_sub(n, gp1, gp2, gp3);
1004 void set_p_Fp6_add()
1008 in_Fp_add(6, gp1, gp2, gp3);
1013 MakeStackFrame<> sf(
this, 7);
1016 void set_p_Fp6_sub()
1020 in_Fp_sub(6, gp1, gp2, gp3);
1025 MakeStackFrame<> sf(
this, 7);
1032 const Reg64& z = rcx;
1033 const Reg64& x = rdx;
1035 const Reg64& z = rdi;
1036 const Reg64& x = rsi;
1038 const Reg64& z3 = r8;
1039 const Reg64& z2 = r9;
1040 const Reg64& z1 = r10;
1041 const Reg64& z0 = r11;
1043 load_rm(z3, z2, z1, z0, x);
1050 load_sub_rm(z3, z2, z1, z0, rax, x,
false);
1052 store_mr(z, z3, z2, z1, z0);
1058 void shrn(
const Reg64& x3,
const Reg64& x2,
const Reg64& x1,
const Reg64& x0,
uint8 n)
1068 void shr1(
const Reg64& x3,
const Reg64& x2,
const Reg64& x1,
const Reg64& x0)
1070 shrn(x3, x2, x1, x0, 1);
1075 void shl1(
const Reg64& x3,
const Reg64& x2,
const Reg64& x1,
const Reg64& x0)
1077 add_rr(x3, x2, x1, x0, x3, x2, x1, x0);
1088 load_rm(gt4, gt3, gt2, gt1, x);
1089 shr1(gt4, gt3, gt2, gt1);
1092 add_rm(gt4, gt3, gt2, gt1, rdx);
1093 store_mr(z, gt4, gt3, gt2, gt1);
1095 void set_p_Fp2_divBy2()
1099 const Reg64& z = gp1;
1100 const Reg64& x = gp2;
1102 sub_Fp_divBy2(z, x);
1103 sub_Fp_divBy2(z + 32, x + 32);
1106 void make_Fp2_divBy2()
1108 MakeStackFrame<> sf(
this, 7);
// Emits code that doubles, in place, the four-qword value stored at [x]:
// each qword is loaded into the scratch register t and added back onto its
// own memory location. The first step uses add and the remaining three use
// adc, so the carry out of each qword feeds the next one; the intervening
// mov instructions leave the flags untouched. t is clobbered. Nothing in the
// emitted sequence consumes the carry out of the last adc.
1114 void shl1(
const Reg64& x,
const Reg64& t)
1116 mov(t, ptr [x + 8 * 0]);
1117 add(ptr [x + 8 * 0], t);
1118 mov(t, ptr [x + 8 * 1]);
1119 adc(ptr [x + 8 * 1], t);
1120 mov(t, ptr [x + 8 * 2]);
1121 adc(ptr [x + 8 * 2], t);
1122 mov(t, ptr [x + 8 * 3]);
1123 adc(ptr [x + 8 * 3], t);
// Emits a routine that loads four qwords from [x], passes them to
// shrn(..., n) and stores the four result qwords to [z]. n defaults to 1.
// NOTE(review): z/x are bound twice below (rcx/rdx, then rdi/rsi); the lines
// that choose between the two bindings are not visible in this excerpt --
// presumably a preprocessor conditional on the target calling convention;
// confirm against the full file.
1125 void make_Fp_shr(
uint8 n = 1)
1128 const Reg64& z = rcx;
1129 const Reg64& x = rdx;
1131 const Reg64& z = rdi;
1132 const Reg64& x = rsi;
// Working registers for the four qwords, z3 paired with offset 24 and z0
// with offset 0 by load_rm/store_mr.
1134 const Reg64& z3 = r8;
1135 const Reg64& z2 = r9;
1136 const Reg64& z1 = r10;
1137 const Reg64& z0 = r11;
1138 load_rm(z3, z2, z1, z0, x);
1139 shrn(z3, z2, z1, z0, n);
1140 store_mr(z, z3, z2, z1, z0);
1147 void mul4x1(
const RegExp& py,
const Reg64& x,
const Reg64& t3,
const Reg64& t2,
const Reg64& t1,
const Reg64& t0,
1150 const Reg64&
a = rax;
1151 const Reg64&
d = rdx;
1154 mulx(t1, t0, ptr [py + 8 * 0]);
1155 mulx(t2,
a, ptr [py + 8 * 1]);
1157 mulx(x,
a, ptr [py + 8 * 2]);
1159 mulx(d,
a, ptr [py + 8 * 3]);
1168 mov(
a, ptr [py + 8]);
1172 mov(
a, ptr [py + 8 * 2]);
1177 mul(qword [py + 8 * 3]);
1195 void montgomery1(
const Reg64& c4,
const Reg64& c3,
const Reg64& c2,
const Reg64& c1,
const Reg64& c0,
1196 const Reg64& px,
const Reg64& y,
const Reg64&
p,
1197 const Reg64& t0,
const Reg64& t1,
const Reg64& t2,
const Reg64& t3,
const Reg64& t4,
bool isFirst)
1199 const Reg64&
a = rax;
1200 const Reg64&
d = rdx;
1202 mul4x1(px, y, c3, c2, c1, c0, c4);
1206 mul4x1(px, y, t3, t2, t1, t0, t4);
1208 add_rr(y, c2, c1, c0, c3, t2, t1, t0);
1214 mul4x1(
p, c3, t3, t2, t1, t0, t4);
1237 mov(gp3, ptr [gp3]);
1238 montgomery1(gt1, gt8, gt4, gt3, gt2, gp2, gp3, gp1, gt5, gt6, gt7, gt9, gt10,
true);
1241 mov(gp3, ptr [gp3 + 8]);
1242 montgomery1(gt2, gt1, gt8, gt4, gt3, gp2, gp3, gp1, gt5, gt6, gt7, gt9, gt10,
false);
1245 mov(gp3, ptr [gp3 + 16]);
1246 montgomery1(gt3, gt2, gt1, gt8, gt4, gp2, gp3, gp1, gt5, gt6, gt7, gt9, gt10,
false);
1249 mov(gp3, ptr [gp3 + 24]);
1250 montgomery1(gt4, gt3, gt2, gt1, gt8, gp2, gp3, gp1, gt5, gt6, gt7, gt9, gt10,
false);
1257 sub_rm(gt4, gt3, gt2, gt1, gp1);
1264 store_mr(gp1, gt4, gt3, gt2, gt1);
1274 void set_p_FpDbl_mod()
1284 MakeStackFrame<> sf(
this, 10);
1288 void make_Fp2_mul_Fp_0()
1290 MakeStackFrame<> sf(
this, 10);
1294 add(gp1,
sizeof(
Fp));
1295 add(gp2,
sizeof(
Fp));
1303 const Reg64& t9,
const Reg64& t8,
const Reg64& t7,
const Reg64& t6,
const Reg64& t5,
const Reg64& t4,
const Reg64& t3,
const Reg64& t2,
const Reg64& t1,
const Reg64& t0)
1308 const Reg64&
a = rax;
1309 const Reg64&
d = rdx;
1313 mulx(t0,
a, ptr [py + 8 * 0]);
1314 mov(ptr [pz + 8 * 0],
a);
1315 mulx(t1,
a, ptr [py + 8 * 1]);
1317 mulx(t2,
a, ptr [py + 8 * 2]);
1319 mulx(t3,
a, ptr [py + 8 * 3]);
1324 mov(
a, ptr [py + 8 * 0]);
1326 mov(ptr [pz + 8 * 0],
a);
1328 mov(
a, ptr [py + 8 * 1]);
1332 mov(
a, ptr [py + 8 * 2]);
1336 mov(
a, ptr [py + 8 * 3]);
1347 mov(t9, ptr [px + 8]);
1350 mul4x1(py, t9, t8, t7, t6, t5, t4);
1351 add_rr(t3, t2, t1, t0, t9, t7, t6, t5);
1354 mov(ptr [pz + 8], t0);
1357 mov(t9, ptr [px + 16]);
1360 mul4x1(py, t9, t7, t6, t5, t4, t0);
1361 add_rr(t8, t3, t2, t1, t9, t6, t5, t4);
1364 mov(ptr [pz + 16], t1);
1366 mov(t9, ptr [px + 24]);
1369 mul4x1(py, t9, t6, t5, t4, t1, t0);
1370 add_rr(t7, t8, t3, t2, t9, t5, t4, t1);
1372 store_mr(pz + 8 * 3, t7, t8, t3, t2);
1373 mov(ptr [pz + 8 * 7], d);
1386 const Reg64&
a = rax;
1387 const Reg64&
d = rdx;
1390 mov(gp1, ptr [gp2 + 8 * 0]);
1398 mul4x1(gp3, gt7, gt4, gt3, gt2, gt1, gt8);
1401 adc(gt2, qword [gp2 + 8 * 1]);
1402 adc(gt3, qword [gp2 + 8 * 2]);
1403 adc(gt7, qword [gp2 + 8 * 3]);
1404 mov(gt4, ptr [gp2 + 8 * 4]);
1406 mov(gt8, ptr [gp2 + 8 * 5]);
1408 mov(gt9, ptr [gp2 + 8 * 6]);
1410 mov(gt10, ptr [gp2 + 8 * 7]);
1421 mul4x1(gp3, gp1, gt1, gt5, gp2, gt6, gt10);
1424 add_rr(gt4, gt7, gt3, gt2, gp1, gt5, gp2, gt6);
1436 mul4x1(gp3, gp1, gt1, gt5, gp2, gt6, gt2);
1438 add_rr(gt8, gt4, gt7, gt3, gp1, gt5, gp2, gt6);
1449 mul4x1(gp3, gp1, gt1, gt5, gp2, gt6, gt2);
1451 add_rr(gt9, gt8, gt4, gt7, gp1, gt5, gp2, gt6);
1458 sub_rm(gt10, gt9, gt8, gt4, gp3);
1465 store_mr(gp1, gt10, gt9, gt8, gt4);
1467#ifdef BN_SUPPORT_SNARK
1472 void in_Fp_mul_xi_addsub(
const RegExp& mz,
const RegExp& mx,
const RegExp& my,
bool doAdd)
1485 mul4x1(mx, gt4, gt5, gt3, gt2, gt1, gt6);
1487 add_rm(gt4, gt3, gt2, gt1, my);
1491 add_rm(gt4, gt3, gt2, gt1, rax);
1493 sub_rm(gt4, gt3, gt2, gt1, my);
1504 sub_rm(gt4, gt3, gt2, gt1, rax - 32 + rdx);
1506 store_mr(mz, gt4, gt3, gt2, gt1);
1513#ifdef BN_SUPPORT_SNARK
1516 in_Fp_mul_xi_addsub(mz, mx, mx + 32,
false);
1517 in_Fp_mul_xi_addsub(mz + 32, mx + 32, mx,
true);
1519 in_Fp_add(mz, mx, mx);
1520 in_Fp_add(mz, mz, mz);
1521 in_Fp_add(mz, mz, mz);
1522 in_Fp_add(mz, mz, mx);
1523 in_Fp_sub(mz, mz, mx + 32);
1525 in_Fp_add(mz + 32, mx + 32, mx + 32);
1526 in_Fp_add(mz + 32, mz + 32, mz + 32);
1527 in_Fp_add(mz + 32, mz + 32, mz + 32);
1528 in_Fp_add(mz + 32, mz + 32, mx + 32);
1529 in_Fp_add(mz + 32, mz + 32, mx);
1532 in_Fp_sub(mz, mx, mx + 32);
1533 in_Fp_add(mz + 32, mx, mx + 32);
1536 void make_Fp2_mul_xi()
1538 MakeStackFrame<> sf(
this, 7);
1539 in_Fp2_mul_xi(gp1, gp2);
1549 load_rm(gt4, gt3, gt2, gt1, mx);
1554 load_rm(gt4, gt3, gt2, gt1, mx + 32);
1560 jnz(
".neg", T_NEAR);
1565 store_mr(mz, rdx, rdx, rdx, rdx);
1566 store_mr(mz + 32, rdx, rdx, rdx, rdx);
1568 jmp(
".exit", T_NEAR);
1574 in_Fp_subNC(mz, rax, mx);
1575 in_Fp_sbbNC(mz + 32, rax + 32, mx + 32);
1579 void make_FpDbl_neg()
1581 MakeStackFrame<> sf(
this, 4);
1582 in_FpDbl_neg(gp1, gp2);
1584 void make_Fp2Dbl_neg()
1586 MakeStackFrame<> sf(
this, 4);
1587 in_FpDbl_neg(gp1, gp2);
1588 in_FpDbl_neg(gp1 + 64, gp2 + 64);
1594 void make_FpDbl_add(
int n)
1596 MakeStackFrame<> sf(
this, 7);
1597 in_FpDbl_add(n, gp1, gp2, gp3);
1602 void make_FpDbl_sub(
int n)
1604 MakeStackFrame<> sf(
this, 7);
1605 in_FpDbl_sub(n, gp1, gp2, gp3);
1611 void make_FpDbl_addNC(
int n)
1613 MakeStackFrame<> sf(
this, 7);
1614 in_FpDbl_addNC(n, gp1, gp2, gp3);
1619 void make_FpDbl_subNC(
int n)
1621 MakeStackFrame<> sf(
this, 7);
1622 for (
int i = 0; i < n; i++) {
1623 in_Fp_subNC(gp1 + 64 * i, gp2 + 64 * i, gp3 + 64 * i);
1624 in_Fp_sbbNC(gp1 + 64 * i + 32, gp2 + 64 * i + 32, gp3 + 64 * i + 32);
1627 void in_Fp2Dbl_mul_xi(
const RegExp& mz,
const RegExp& mx)
1632 call(p_Fp2Dbl_mul_xi);
1634 void make_Fp2Dbl_mul_xi()
1636 MakeStackFrame<> sf(
this, 7);
1637 call(p_Fp2Dbl_mul_xi);
1643 void make_FpDbl_mul()
1645 MakeStackFrame<> sf(
this, 10);
1646 mul4x4(gp1, gp2, gp3, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1, gt10);
1652 void make_FpDbl_mod()
1654 MakeStackFrame<> sf(
this, 10);
1660 void in_Fp2Dbl_mod()
1674 void make_Fp2Dbl_mod()
1676 MakeStackFrame<> sf(
this, 10);
1687 const Reg64& x3,
const Reg64& x2,
const Reg64& x1,
const Reg64& x0,
1688 const Reg64& t0,
const Reg64& t1,
const Reg64& t2,
const Reg64& t3)
1690 const Reg64&
a = rax;
1696 sub_rm(x3, x2, x1, x0,
a + rdx);
1699 load_rm(t3, t2, t1, t0,
a +
sizeof(
Fp));
1704 add_rr(x3, x2, x1, x0, t3, t2, t1, t0);
1709 void set_p_Fp2_square()
1717 void in_Fp2_square()
1720 const Ext2<Fp> z(gp1);
1721 const Ext2<Fp> x(gp2);
1722 const Ext1<Fp> t(rsp);
1723 const Ext1<FpDbl>
d0(rsp, t.next);
1724 const Ext1<FpDbl>
d1(rsp,
d0.next);
1725 const int SS =
d1.next;
1728#ifdef BN_SUPPORT_SNARK
1730 load_rm(gt4, gt3, gt2, gt1, x.b_);
1731 add_rr(gt4, gt3, gt2, gt1, gt4, gt3, gt2, gt1);
1733 store_mr(t, gt4, gt3, gt2, gt1);
1736 mul4x4(
d0, t, x, gt10, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1);
1740 load_add_rm(gt4, gt3, gt2, gt1, x.a_, rax,
false);
1741 sub_rm(gt4, gt3, gt2, gt1, x.b_);
1742 store_mr(t, gt4, gt3, gt2, gt1);
1744 in_Fp_add(z.a_, x.a_, x.b_);
1746 load_rm(gt4, gt3, gt2, gt1, x.b_);
1747 add_rr(gt4, gt3, gt2, gt1, gt4, gt3, gt2, gt1);
1748 store_mr(t, gt4, gt3, gt2, gt1);
1751 mul4x4(
d0, t, x, gt10, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1);
1755 load_add_rm(gt4, gt3, gt2, gt1, x.a_, rax,
false);
1756 sub_rm(gt4, gt3, gt2, gt1, x.b_);
1757 store_mr(t, gt4, gt3, gt2, gt1);
1759 in_Fp_add_carry(z.a_, x.a_, x.b_,
false);
1762 mul4x4(
d1, t, z.a_, gt10, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1);
1779 void make_Fp2_square()
1781 MakeStackFrame<> sf(
this, 10);
1787 void sub_Fp2Dbl_subNC(
const RegExp& pz,
const RegExp& px,
1788 const Reg64& t3,
const Reg64& t2,
const Reg64& t1,
const Reg64& t0)
1790 load_sub_rm(t3, t2, t1, t0, pz, px,
false);
1791 store_mr(pz + 8 * 0, t3, t2, t1, t0);
1793 load_sub_rm(t3, t2, t1, t0, pz +
sizeof(
Fp), px +
sizeof(
Fp),
true);
1794 store_mr(pz +
sizeof(
Fp), t3, t2, t1, t0);
1805 load_rm(gt4, gt3, gt2, gt1, mx);
1807 add(gt4, ptr [rax + 8 * 3]);
1808 load_add_rm(rdx, gt7, gt6, gt5, mx +
sizeof(
Fp), rax +
sizeof(
Fp),
true);
1809 sub_rm(gt4, gt3, gt2, gt1, my);
1810 store_mr(mz, gt4, gt3, gt2, gt1);
1811 sbb_rm(rdx, gt7, gt6, gt5, my +
sizeof(
Fp));
1812 store_mr(mz + 32, rdx, gt7, gt6, gt5);
1816 void set_p_Fp2_mul()
1822 const Ext2<Fp> z(gp1);
1823 const Ext2<Fp> x(gp2);
1824 const Ext2<Fp>
y(gp3);
1826 const Ext1<Fp>
s(rsp);
1827 const Ext1<Fp> t(rsp,
s.next);
1828 const Ext1<FpDbl>
d0(rsp, t.next);
1829 const Ext1<FpDbl>
d1(rsp,
d0.next);
1830 const Ext1<FpDbl> d2(rsp,
d1.next);
1831 const int SS = d2.next;
1834 in_Fp_addNC(
s, x.a_, x.b_);
1836 in_Fp_addNC(t,
y.a_,
y.b_);
1838 mul4x4(
d0,
s, t, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1, gt10);
1839 mul4x4(
d1, x.a_,
y.a_, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1, gt10);
1840 mul4x4(d2, x.b_,
y.b_, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1, gt10);
1843 sub_Fp2Dbl_subNC(
d0,
d1, gt3, gt2, gt1, gt10);
1845 sub_Fp2Dbl_subNC(
d0, d2, gt3, gt2, gt1, gt10);
1847 sub_FpDbl_sub(
d1,
d1, d2);
1854 add(gp1,
sizeof(
Fp));
1865 MakeStackFrame<> sf(
this, 10);
1870 void make_Fp_preInv()
1872 MakeStackFrame<> sf(
this, 10, 4);
1873 const Reg64&
r = gp1;
1874 const Reg64& v0 = gp2;
1875 const Reg64& v1 = gp3;
1876 const Reg64& v2 = gt1;
1877 const Reg64& v3 = gt2;
1878 const Reg64& u0 = gt3;
1879 const Reg64& u1 = gt4;
1880 const Reg64& u2 = gt5;
1881 const Reg64& u3 = gt6;
1882 const Reg64& s0 = gt7;
1883 const Reg64& s1 = gt8;
1884 const Reg64& s2 = gt9;
1885 const Reg64& s3 = gt10;
1886 const Reg64& t = rdx;
1889 const Reg64&
a = rax;
1897 load_rm(u3, u2, u1, u0, t);
1898 mov(v3, ptr [v0 + 8 * 3]);
1899 mov(v2, ptr [v0 + 8 * 2]);
1900 mov(v1, ptr [v0 + 8 * 1]);
1901 mov(v0, ptr [v0 + 8 * 0]);
1903 lea(s0, ptr [s3 + 1]);
1908 mov(ptr [rsp + 8 * 0], s3);
1909 mov(ptr [rsp + 8 * 1], s3);
1910 mov(ptr [rsp + 8 * 2],
r);
1924 jz(
".exit", T_NEAR);
1926 jz(
".u_even", T_NEAR);
1933 sub_rr(v3, v2, v1, v0, u3, u2, u1, u0);
1935 add(s0, ptr [rsp + 8 * 0]);
1936 adc(s1, ptr [rsp + 8 * 1]);
1940 shr1(v3, v2, v1, v0);
1941 mov(t, ptr [rsp + 8 * 0]);
1942 add(ptr [rsp + 8 * 0], t);
1943 mov(t, ptr [rsp + 8 * 1]);
1944 adc(ptr [rsp + 8 * 1], t);
1955 sub_rr(u3, u2, u1, u0, v3, v2, v1, v0);
1956 add(ptr [rsp + 8 * 0], s0);
1957 adc(ptr [rsp + 8 * 1], s1);
1961 shr1(u3, u2, u1, u0);
1962 shl1(s3, s2, s1, s0);
1970 load_rm(s3, s2, s1, s0, t);
1971 sub(s0, ptr [rsp + 8 * 0]);
1972 sbb(s1, ptr [rsp + 8 * 1]);
1975 mov(
r, ptr [rsp + 8 * 2]);
1976 store_mr(
r, s3, s2, s1, s0);
1986 sub(ptr [gt1], eax);
1987 sbb(ptr [gt1 + 4], edx);
1993 add(ptr [gt1], eax);
1994 adc(ptr [gt1 + 4], edx);
1995 inc(dword [gt1 + 8]);
2003 void sub_Fp2Dbl_mulOpt(
int mode)
2011 Ext1<Fp> t(rsp,
s.next);
2012 Ext1<FpDbl>
d0(rsp, t.next);
2013 const int SS =
d0.next;
2017 in_Fp_addNC(
s, x.a_, x.b_);
2019 in_Fp_addNC(t,
y.a_,
y.b_);
2021 mul4x4(
d0, x.b_,
y.b_, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1, gt10);
2022 mul4x4(z.a_, x.a_,
y.a_, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1, gt10);
2023 mul4x4(z.b_,
s, t, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1, gt10);
2026 load_sub_rm(gt3, gt2, gt1, gt10, z.b_, z.a_,
false);
2028 load_sub_rm(gt7, gt6, gt5, gt4, (
RegExp)z.b_ +
sizeof(
Fp), (
RegExp)z.a_ +
sizeof(
Fp),
true);
2030 sub_rm(gt3, gt2, gt1, gt10,
d0);
2031 sbb_rm(gt7, gt6, gt5, gt4, (
RegExp)
d0 +
sizeof(
Fp));
2034 store_mr(z.b_, gt3, gt2, gt1, gt10);
2035 store_mr((
RegExp)z.b_ +
sizeof(
Fp), gt7, gt6, gt5, gt4);
2039 in_FpDbl_subOpt1(z.a_, z.a_,
d0);
2043 sub_FpDbl_sub(z.a_, z.a_,
d0);
2048 void set_p_Fp2Dbl_mulOpt(
int mode)
2059 printf(
"err set_p_Fp2Dbl_mulOpt mode=%d\n", mode);
2061 sub_Fp2Dbl_mulOpt(mode);
2063 void set_p_Fp2Dbl_mul_xi()
2068#ifdef BN_SUPPORT_SNARK
2069 in_FpDbl_add(gp1, gp2, gp2);
2070 in_FpDbl_add(gp1, gp1, gp1);
2071 in_FpDbl_add(gp1, gp1, gp1);
2072 in_FpDbl_add(gp1, gp1, gp2);
2073 sub_FpDbl_sub(gp1, gp1, gp2 +
sizeof(
FpDbl));
2075 in_FpDbl_add(gp1 + 64, gp2 +
sizeof(
FpDbl), gp2 +
sizeof(
FpDbl));
2076 in_FpDbl_add(gp1 + 64, gp1 + 64, gp1 + 64);
2077 in_FpDbl_add(gp1 + 64, gp1 + 64, gp1 + 64);
2078 in_FpDbl_add(gp1 + 64, gp1 + 64, gp2 +
sizeof(
FpDbl));
2079 in_FpDbl_add(gp1 + 64, gp1 + 64, gp2);
2081 sub_FpDbl_sub(gp1, gp2, gp2 +
sizeof(
FpDbl));
2082 in_FpDbl_add(gp1 + 64, gp2 +
sizeof(
FpDbl), gp2);
2088 void make_Fp2Dbl_mulOpt(
int mode)
2090 MakeStackFrame<> sf(
this, 10);
2092 call(p_Fp2Dbl_mulOpt1);
2094 call(p_Fp2Dbl_mulOpt2);
2103 void set_p_Fp6Dbl_mul()
2111 const Ext6<FpDbl> z(gt8);
2112 const Ext6<Fp> x(gt9);
2113 const Ext6<Fp>
y(gt10);
2114 const Ext2<Fp> t0(rsp);
2115 const Ext2<Fp> t1(rsp, t0.next);
2116 const Ext2<FpDbl> T0(rsp, t1.next);
2117 const Ext2<FpDbl> T1(rsp, T0.next);
2118 const Ext2<FpDbl> T2(rsp, T1.next);
2119 const int SS = T2.next;
2131 lea(gp2, ptr [x.a_]);
2132 lea(gp3, ptr [
y.a_]);
2133 call(p_Fp2Dbl_mulOpt1);
2139 lea(gp2, ptr [x.b_]);
2140 lea(gp3, ptr [
y.b_]);
2141 call(p_Fp2Dbl_mulOpt1);
2147 lea(gp2, ptr [x.c_]);
2148 lea(gp3, ptr [
y.c_]);
2149 call(p_Fp2Dbl_mulOpt1);
2154 in_Fp2_addNC(t0, x.b_, x.c_);
2156 in_Fp2_addNC(t1,
y.b_,
y.c_);
2160 lea(gp1, ptr [z.c_]);
2163 call(p_Fp2Dbl_mulOpt2);
2169 in_FpDbl_addNC(2, z.b_, T1, T2);
2172 in_FpDbl_sub(z.c_.a_, z.c_.a_, z.b_.a_);
2175 in_FpDbl_subNC(z.c_.b_, z.c_.b_, z.b_.b_);
2178 in_Fp2Dbl_mul_xi(z.b_, z.c_);
2181 in_Fp2Dbl_add(z.a_, z.b_, T0);
2184 in_Fp2_addNC(t0, x.a_, x.b_);
2187 in_Fp2_addNC(t1,
y.a_,
y.b_);
2190 lea(gp1, ptr [z.c_]);
2193 call(p_Fp2Dbl_mulOpt2);
2200 in_FpDbl_addNC(2, z.b_, T0, T1);
2203 in_FpDbl_sub(z.c_.a_, z.c_.a_, z.b_.a_);
2206 in_FpDbl_subNC(z.c_.b_, z.c_.b_, z.b_.b_);
2209#ifdef BN_SUPPORT_SNARK
2210 in_Fp2Dbl_mul_xi(z.b_, T2);
2212 in_FpDbl_subOpt1(z.b_.a_, T2.a_, T2.b_);
2215 in_FpDbl_add(z.b_.b_, T2.a_, T2.b_);
2219 in_Fp2Dbl_add(z.b_, z.b_, z.c_);
2222 in_Fp2_addNC(t0, x.a_, x.c_);
2225 in_Fp2_addNC(t1,
y.a_,
y.c_);
2228 lea(gp1, ptr [z.c_]);
2231 call(p_Fp2Dbl_mulOpt2);
2236 in_FpDbl_addNC(2, T2, T2, T0);
2239 in_FpDbl_sub(z.c_.a_, z.c_.a_, T2.a_);
2242 in_FpDbl_add(z.c_.a_, z.c_.a_, T1.a_);
2245 load_sub_rm(gt4, gt3, gt2, gt1, z.c_.b_, T2.b_,
false);
2246 load_sub_rm(rdx, rax, gt6, gt5, (
RegExp)z.c_.b_ +
sizeof(
Fp), (
RegExp)T2.b_ +
sizeof(
Fp),
true);
2247 add_rm(gt4, gt3, gt2, gt1, T1.b_);
2248 adc_rm(rdx, rax, gt6, gt5, (
RegExp)T1.b_ +
sizeof(
Fp));
2249 store_mr(z.c_.b_, gt4, gt3, gt2, gt1);
2250 store_mr((
RegExp)z.c_.b_ +
sizeof(
Fp), rdx, rax, gt6, gt5);
2253 in_FpDbl_subNC(z.c_.b_, z.c_.b_, T2.b_);
2256 in_FpDbl_addNC(z.c_.b_, z.c_.b_, T1.b_);
2264 void make_Fp6Dbl_mul()
2266 MakeStackFrame<> sf(
this, 10);
2275 void set_p_Fp6_mul()
2280 const int SS =
sizeof(
Fp6Dbl);
2288 for (
int i = 0; i < 6; i++) {
2294 lea(gp2, ptr [rsp + 64 * i]);
2306 MakeStackFrame<> sf(
this, 10);
// Emits debugging code that copies n qwords from [rbx] to [rdx] while keeping
// rax, rdx and rbx intact across the copy by spilling them to `save`.
// NOTE(review): the statements that declare `save` and that point rbx/rdx at
// the source (presumably m) and destination buffers are not visible in this
// excerpt -- confirm against the full file.
2310 void debug_save_buf(
const RegExp& m,
int n)
// Spill the three registers the copy uses; rcx addresses the spill area.
2316 mov(rcx, (
size_t)save);
2317 mov(ptr [rcx], rax);
2318 mov(ptr [rcx + 8], rdx);
2319 mov(ptr [rcx + 16], rbx);
// Unrolled at code-generation time: one load/store pair per qword.
2324 for (
int i = 0; i < n; i++) {
2325 mov(rax, ptr [rbx + i * 8]);
2326 mov(ptr [rdx + i * 8], rax);
// Reload the spilled registers from the spill area. rbx has to be read
// through rcx like the other two: at this point rbx still holds the
// copy-source pointer, so using it as the base would load a qword of the
// copied data rather than the saved rbx.
2329 mov(rcx, (
size_t)save);
2330 mov(rax, ptr [rcx]);
2331 mov(rdx, ptr [rcx + 8]);
2332 mov(rbx, ptr [rcx + 16]);
2335 void debug_count_inc()
2338 mov(rax, (
size_t)&debug_counter);
2339 add(dword [rax], 1);
2346 void make_Compress_square_n()
2350 const Ext2<Fp> t0(rsp);
2351 const Ext2<Fp> t1(rsp, t0.next);
2352 const Ext2<Fp> t2(rsp, t1.next);
2353 const Ext2<FpDbl> T0(rsp, t2.next);
2354 const Ext2<FpDbl> T1(rsp, T0.next);
2355 const Ext2<FpDbl> T2(rsp, T1.next);
2356 const Ext2<FpDbl> T3(rsp, T2.next);
2357 const int nsave = T3.next;
2358 const int SS = nsave + 8;
2360 MakeStackFrame<> sf(
this, 10, SS / 8);
2362 const Reg64& z = gt10;
2364 const int g2 =
sizeof(
Fp2) * 3;
2365 const int g3 =
sizeof(
Fp2) * 2;
2366 const int g4 =
sizeof(
Fp2) * 1;
2367 const int g5 =
sizeof(
Fp2) * 5;
2369 mov(ptr [rsp + nsave], gp2);
2377 lea(gp2, ptr [z + g4]);
2378 call(p_Fp2Dbl_square);
2384 call(p_Fp2Dbl_square);
2387 in_Fp2Dbl_mul_xi(T2, T1);
2390 in_Fp2Dbl_add(T2, T2, T0);
2399 in_Fp2_add(t0, z + g4, z + g5);
2404 call(p_Fp2Dbl_square);
2409 in_FpDbl_add(2, T0, T0, T1);
2412 in_Fp2Dbl_sub(T2, T2, T0);
2421 in_Fp2_add(t1, z + g2, z + g3);
2426 call(p_Fp2Dbl_square);
2431 lea(gp2, ptr [z + g2]);
2432 call(p_Fp2Dbl_square);
2435 in_Fp2_mul_xi(t1, t0);
2438 lea(gp1, ptr [z + g2]);
2440 call(p_Fp2_2z_add_3x);
2443 in_Fp2_add(z + g2, z + g2, t1);
2446 in_Fp2_add(z + g2, z + g2, z + g2);
2449 in_Fp2_add(z + g2, z + g2, t1);
2453 in_Fp2_sub(t1, t2, z + g3);
2456 in_Fp2_add(t1, t1, t1);
2460 lea(gp2, ptr [z + g3]);
2461 call(p_Fp2Dbl_square);
2465 in_Fp2_add(z + g3, t1, t2);
2468 in_Fp2Dbl_mul_xi(T0, T1);
2473 in_FpDbl_add(2, T0, T0, T2);
2482 for (
int i = 0; i < 2; i++) {
2484 load_add_rm(gt4, gt3, gt2, gt1, (
RegExp)t0 +
sizeof(
Fp) * i, rax,
false);
2485 sub_rm(gt4, gt3, gt2, gt1, z + g4 +
sizeof(
Fp) * i);
2486 add_rr(gt4, gt3, gt2, gt1, gt4, gt3, gt2, gt1);
2487 add_rm(gt4, gt3, gt2, gt1, (
RegExp)t0 +
sizeof(
Fp) * i);
2488 fast_modp(gt4, gt3, gt2, gt1, gp1, gp2, gp3, gt5);
2489 store_mr(z + g4 +
sizeof(
Fp) * i, gt4, gt3, gt2, gt1);
2494 in_Fp2_sub(z + g4, t0, z + g4);
2497 in_Fp2_add(z + g4, z + g4, z + g4);
2500 in_Fp2_add(z + g4, z + g4, t0);
2504 in_FpDbl_addNC(2, T2, T2, T1);
2507 in_Fp2Dbl_sub(T3, T3, T2);
2517 lea(gp1, ptr [z + g5]);
2519 call(p_Fp2_2z_add_3x);
2521 in_Fp2_add(z + g5, z + g5, t0);
2524 in_Fp2_add(z + g5, z + g5, z + g5);
2526 in_Fp2_add(z + g5, z + g5, t0);
2529 sub(qword [rsp + nsave], 1);
2538 void in_Fp_2z_add_3x(
const RegExp& mz,
const RegExp& mx)
2540 load_add_rm(gt4, gt3, gt2, gt1, mz, mx,
false);
2541 add_rr(gt4, gt3, gt2, gt1, gt4, gt3, gt2, gt1);
2542 add_rm(gt4, gt3, gt2, gt1, mx);
2543 fast_modp(gt4, gt3, gt2, gt1, gp3, gt5, gt6, gt7);
2544 store_mr(mz, gt4, gt3, gt2, gt1);
2551 void set_p_Fp2_2z_add_3x()
2555 in_Fp_2z_add_3x(gp1, gp2);
2556 in_Fp_2z_add_3x(gp1 +
sizeof(
Fp), gp2 +
sizeof(
Fp));
2562 const int b =
sizeof(
Fp2);
2563 const int c =
sizeof(
Fp2) * 2;
2564 in_Fp2_mul_xi(mz +
a, mx + c);
2565 in_Fp2_add(mz +
a, mz +
a, my +
a);
2566 in_Fp2_add(mz + b, mx +
a, my + b);
2567 in_Fp2_add(mz + c, mx + b, my + c);
2570 void make_Fp12_square()
2573 const Ext6<Fp> t0(rsp);
2574 const Ext6<Fp> t1(rsp, t0.next);
2575 const int zsave = t1.next;
2576 const int SS = zsave + 8;
2577 const Ext12<Fp> z(gt10);
2578 MakeStackFrame<> sf(
this, 10, SS / 8);
2581 mov(ptr [rsp + zsave], gp1);
2583 lea(gp2, ptr [z.a_]);
2584 lea(gp3, ptr [z.b_]);
2587 sub_Fp2_mul_gamma_add(t1, z.b_, z.a_);
2589 lea(gp1, ptr [z.b_]);
2593 mov(gp1, ptr [rsp + zsave]);
2598 mov(z.r_, ptr [rsp + zsave]);
2599 sub_Fp2_mul_gamma_add(t1, z.b_, z.b_);
2601 mov(z.r_, ptr [rsp + zsave]);
2607 lea(gp1, ptr [z.b_]);
2613 void make_Fp12_mul()
2615 const Ext12<Fp> z(gt8);
2616 const Ext12<Fp> x(gt9);
2617 const Ext12<Fp>
y(gt10);
2619 const Ext6<Fp> t0(rsp);
2620 const Ext6<Fp> t1(rsp, t0.next);
2621 const Ext6<FpDbl> T0(rsp, t1.next);
2622 const Ext6<FpDbl> T1(rsp, T0.next);
2623 const Ext6<FpDbl> T2(rsp, T1.next);
2624 const Ext12<FpDbl> zd(rsp, T2.next);
2625 const Ext1<uint64> zsave(rsp, zd.next);
2626 const Ext1<uint64> xsave(rsp, zsave.next);
2627 const Ext1<uint64> ysave(rsp, xsave.next);
2628 const int SS = ysave.next;
2629 MakeStackFrame<> sf(
this, 10, SS / 8);
2630 mov(ptr [zsave], gp1);
2631 mov(ptr [xsave], gp2);
2632 mov(ptr [ysave], gp3);
2642 mov(x.r_, ptr [xsave]);
2643 mov(
y.r_, ptr [ysave]);
2646 lea(gp2, ptr [x.b_]);
2647 lea(gp3, ptr [
y.b_]);
2654 mov(x.r_, ptr [xsave]);
2656 lea(gp2, ptr [x.a_]);
2657 lea(gp3, ptr [x.b_]);
2661 mov(
y.r_, ptr [ysave]);
2663 lea(gp2, ptr [
y.a_]);
2664 lea(gp3, ptr [
y.b_]);
2668 lea(gp1, ptr [zd.a_]);
2677 in_FpDbl_add(6, T2, T0, T1);
2680 in_FpDbl_sub(6, zd.b_, zd.a_, T2);
2683 in_Fp2Dbl_mul_xi(zd.a_.a_, T1.c_);
2684 in_FpDbl_add(2, zd.a_.a_, zd.a_.a_, T0.a_);
2685 in_FpDbl_add(2, zd.a_.b_, T1.a_, T0.b_);
2686 in_FpDbl_add(2, zd.a_.c_, T1.b_, T0.c_);
2689 for (
int i = 0; i < 12; i++) {
2690 mov(gp1, ptr [zsave]);
2691 if (i > 0)
add(gp1,
sizeof(
Fp) * i);
2697 void make_Fp12Dbl_mul_Fp2_024()
2703 const Ext2<Fp> t0(rsp);
2704 const Ext2<Fp> t1(rsp, t0.next);
2705 const Ext2<Fp> t2(rsp, t1.next);
2706 const Ext2<Fp> t4(rsp, t2.next);
2707 const Ext2<FpDbl> T2(rsp, t4.next);
2708 const Ext2<FpDbl> T3(rsp, T2.next);
2709 const Ext2<FpDbl> X0T0(rsp, T3.next);
2710 const Ext2<FpDbl> X2T2(rsp, X0T0.next);
2711 const Ext2<FpDbl> X4T4(rsp, X2T2.next);
2712 const Ext2<FpDbl> ACC(rsp, X4T4.next);
2713 const int SS = ACC.next;
2714 const Ext12<Fp> z(gt9);
2715 const Ext6<Fp> x(gt10);
2717 MakeStackFrame<> sf(
this, 10, SS / 8);
2726 lea(gp1, ptr [X0T0]);
2729 call(p_Fp2Dbl_mulOpt2);
2734 lea(gp1, ptr [X2T2]);
2735 lea(gp2, ptr [z.a_.c_]);
2736 lea(gp3, ptr [x.c_]);
2737 call(p_Fp2Dbl_mulOpt2);
2742 lea(gp1, ptr [X4T4]);
2743 lea(gp2, ptr [z.b_.b_]);
2744 lea(gp3, ptr [x.b_]);
2745 call(p_Fp2Dbl_mulOpt2);
2749 in_Fp2_add(t2, z.a_.a_, z.b_.b_);
2752 in_Fp2_add(t1, z.a_.a_, z.a_.c_);
2755 in_Fp2_add(t4, z.a_.b_, z.b_.a_);
2758 in_Fp2_add(t4, t4, z.b_.c_);
2762 lea(gp1, ptr [ACC]);
2763 lea(gp2, ptr [z.a_.b_]);
2764 lea(gp3, ptr [x.c_]);
2765 call(p_Fp2Dbl_mulOpt2);
2768 in_Fp2Dbl_add(T2, ACC, X4T4);
2771 in_Fp2Dbl_mul_xi(T3, T2);
2774 in_Fp2Dbl_add(T3, T3, X0T0);
2778 lea(gp1, ptr [z.a_.a_]);
2786 lea(gp2, ptr [z.b_.c_]);
2787 lea(gp3, ptr [x.b_]);
2788 call(p_Fp2Dbl_mulOpt2);
2791 in_Fp2Dbl_add(ACC, ACC, T2);
2794 in_Fp2Dbl_add(T2, T2, X2T2);
2797 in_Fp2Dbl_mul_xi(T3, T2);
2803 lea(gp2, ptr [z.a_.b_]);
2804 lea(gp3, ptr [x.a_]);
2805 call(p_Fp2Dbl_mulOpt2);
2808 in_Fp2Dbl_add(ACC, ACC, T2);
2811 in_Fp2Dbl_add(T3, T3, T2);
2815 lea(gp1, ptr [z.a_.b_]);
2821 in_Fp2_add(t0, x.a_, x.c_);
2827 call(p_Fp2Dbl_mulOpt2);
2830 in_Fp2Dbl_sub(T2, T2, X0T0);
2833 in_Fp2Dbl_sub(T2, T2, X2T2);
2839 lea(gp2, ptr [z.b_.a_]);
2840 lea(gp3, ptr [x.b_]);
2841 call(p_Fp2Dbl_mulOpt2);
2844 in_Fp2Dbl_add(ACC, ACC, T3);
2847 in_Fp2Dbl_add(T2, T2, T3);
2851 in_Fp2_add(t0, z.a_.c_, z.b_.b_);
2854 lea(gp1, ptr [z.a_.c_]);
2860 in_Fp2_add(t1, x.c_, x.b_);
2866 call(p_Fp2Dbl_mulOpt2);
2870 in_Fp2Dbl_sub(T2, T2, X2T2);
2873 in_Fp2Dbl_sub(T2, T2, X4T4);
2876 in_Fp2Dbl_mul_xi(T3, T2);
2882 lea(gp2, ptr [z.b_.a_]);
2884 call(p_Fp2Dbl_mulOpt2);
2887 in_Fp2Dbl_add(ACC, ACC, T2);
2890 in_Fp2Dbl_add(T3, T3, T2);
2895 lea(gp1, ptr [z.b_.a_]);
2903 lea(gp2, ptr [z.b_.c_]);
2904 lea(gp3, ptr [x.c_]);
2905 call(p_Fp2Dbl_mulOpt2);
2908 in_Fp2Dbl_add(ACC, ACC, T2);
2911 in_Fp2Dbl_mul_xi(T3, T2);
2915 in_Fp2_add(t0, x.a_, x.b_);
2921 call(p_Fp2Dbl_mulOpt2);
2924 in_Fp2Dbl_sub(T2, T2, X0T0);
2927 in_Fp2Dbl_sub(T2, T2, X4T4);
2930 in_Fp2Dbl_add(T3, T3, T2);
2934 lea(gp1, ptr [z.b_.b_]);
2940 in_Fp2_add(t0, x.a_, x.c_);
2943 in_Fp2_add(t0, t0, x.b_);
2949 call(p_Fp2Dbl_mulOpt2);
2953 in_Fp2Dbl_sub(T2, T2, ACC);
2956 lea(gp1, ptr [z.b_.c_]);
2965 void set_p_Fp2Dbl_square()
2970 const Ext2<FpDbl> z(gp1);
2971 const Ext2<Fp> x(gp2);
2973 const Ext1<Fp> t0(rsp);
2974 const Ext1<Fp> t1(rsp, t0.next);
2975 const int SS = t1.next;
2977 const Reg64& gt0 = gp3;
2978 const Reg64&
a = rax;
2982 load_rm(gt3, gt2, gt1, gt0, x.b_);
2983 add_rr(gt3, gt2, gt1, gt0, gt3, gt2, gt1, gt0);
2984 store_mr(t0, gt3, gt2, gt1, gt0);
2986 mul4x4(z.b_, t0, x.a_, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1, gt0);
2991 load_add_rm(gt3, gt2, gt1, gt0, x.a_,
a,
false);
2992 sub_rm(gt3, gt2, gt1, gt0, x.b_);
2993 store_mr(t1, gt3, gt2, gt1, gt0);
2996 in_Fp_addNC(t0, x.a_, x.b_);
2998 mul4x4(z, t0, t1, gt9, gt8, gt7, gt6, gt5, gt4, gt3, gt2, gt1, gt0);
3006 void make_Fp2Dbl_square()
3008 MakeStackFrame<> sf(
this, 9);
3009 call(p_Fp2Dbl_square);
3012 void make_pointDblLineEval(
bool withoutP)
3016 const Ext2<Fp> t0(rsp);
3017 const Ext2<Fp> t1(rsp, t0.next);
3018 const Ext2<Fp> t2(rsp, t1.next);
3019 const Ext2<Fp> t3(rsp, t2.next);
3020 const Ext2<Fp> t4(rsp, t3.next);
3021 const Ext2<Fp> t5(rsp, t4.next);
3022 const Ext2<FpDbl> T0(rsp, t5.next);
3023 const Ext2<FpDbl> T1(rsp, T0.next);
3024 const Ext2<FpDbl> T2(rsp, T1.next);
3025 const int SS = T2.next;
3026 const Ext6<Fp>
l(gt8);
3027 const Reg64&
R = gt9;
3028 const Reg64&
P = gt10;
3030 MakeStackFrame<> sf(
this, 10, SS / 8);
3042 lea(gp2, ptr [
R +
sizeof(
Fp2) * 2]);
3048 lea(gp2, ptr [
R +
sizeof(
Fp2) * 0]);
3049 lea(gp3, ptr [
R +
sizeof(
Fp2) * 1]);
3055 lea(gp2, ptr [
R +
sizeof(
Fp2) * 1]);
3059 in_Fp2_add(t3, t0, t0);
3067 in_Fp2_add(t5, t0, t1);
3070 in_Fp2_add(t0, t0, t3);
3072#ifdef BN_SUPPORT_SNARK
3076 in_Fp_mul_xi_addsub(t2, t0, t0 + 32,
true);
3077 in_Fp_mul_xi_addsub(t2 + 32, t0 + 32, t0,
false);
3086 in_Fp_add(t2.a_, t0.a_, t0.b_);
3089 in_Fp_sub(t2.b_, t0.b_, t0.a_);
3098 in_Fp2_add(t3, t2, t2);
3101 in_Fp2_add(t3, t3, t2);
3105 in_Fp2_addNC(
l.c_, t0, t0);
3109 in_Fp2_sub(
R, t1, t3);
3112 in_Fp2_addNC(
l.c_,
l.c_, t0);
3115 in_Fp2_add(t3, t3, t1);
3131 call(p_Fp2Dbl_square);
3136 call(p_Fp2Dbl_square);
3139 in_FpDbl_addNC(2, T2, T1, T1);
3143 in_Fp2_add(t3,
R +
sizeof(
Fp2) * 1,
R +
sizeof(
Fp2) * 2);
3146#ifdef BN_SUPPORT_SNARK
3147 in_FpDbl_add(2, T2, T2, T1);
3149 in_FpDbl_addNC(2, T2, T2, T1);
3158 in_Fp2_sub(t3, t3, t5);
3161 in_FpDbl_sub(2, T0, T0, T2);
3165 lea(gp1, ptr [
R +
sizeof(
Fp2) * 1]);
3171 lea(gp1, ptr [
R +
sizeof(
Fp2) * 2]);
3177 in_Fp2_sub(t2, t2, t1);
3181 in_Fp2_mul_xi(
l, t2);
3185 in_Fp2_neg(
l.b_, t3);
3186 if (withoutP)
return;
3189 lea(gp1, ptr [
l.c_]);
3194 lea(gp1, ptr [
l.c_.b_]);
3203 lea(gp1, ptr [
l.b_]);
3205 lea(gp3, ptr [
P +
sizeof(
Fp) * 1]);
3209 lea(gp1, ptr [
l.b_.b_]);
3211 lea(gp3, ptr [
P +
sizeof(
Fp) * 1]);
3216 PairingCode(
size_t size,
void *userPtr)
3217 :
Xbyak::CodeGenerator(size, userPtr)
3257 int storeReg(
int gtn,
int numQword = 0)
3259 const Reg64
tbl[] = {
3260 gt3, gt4, gt5, gt6, gt7, gt8, gt9, gt10
3262 assert(0 <= gtn && gtn <= 10);
3265 const int P = 8 * (std::max(0, gtn - 6) + numQword);
3266 if (
P > 0)
sub(rsp,
P);
3267 for (
int i = 3; i <= std::min(gtn, 6); i++) {
3268 mov(ptr [rsp +
P + (i - 2) * 8],
tbl[i - 3]);
3270 for (
int i = 7; i <= gtn; i++) {
3271 mov(ptr [rsp +
P - 8 * (i - 6)],
tbl[i - 3]);
3274 const int P = 8 * (std::max(0, gtn - 4) + numQword);
3275 if (
P > 0)
sub(rsp,
P);
3276 for (
int i = 5; i <= gtn; i++) {
3277 mov(ptr [rsp +
P - 8 * (i - 4)],
tbl[i - 3]);
3286 void restoreReg(
int P)
3288 const Reg64
tbl[] = {
3289 gt3, gt4, gt5, gt6, gt7, gt8, gt9, gt10
3291 assert(0 <= gtn_ && gtn_ <= 10);
3293 for (
int i = 3; i <= std::min(gtn_, 6); i++) {
3294 mov(
tbl[i - 3], ptr [rsp +
P + (i - 2) * 8]);
3296 for (
int i = 7; i <= gtn_; i++) {
3297 mov(
tbl[i - 3], ptr [rsp +
P - 8 * (i - 6)]);
3300 for (
int i = 5; i <= gtn_; i++) {
3301 mov(
tbl[i - 3], ptr [rsp +
P - 8 * (i - 4)]);
3304 if (
P > 0)
add(rsp,
P);
3308 detectCpu(mode, useMulx);
3311 const size_t N = 64;
3329 set_p_Fp2_2z_add_3x();
3332 set_p_FpDbl_addNC();
3333 set_p_FpDbl_subNC();
3334 set_p_Fp2Dbl_mul_xi();
3335 set_p_Fp2Dbl_mulOpt(1);
3336 set_p_Fp2Dbl_mulOpt(2);
3337 set_p_Fp2Dbl_square();
3344 typedef void (*opFpx2)(
Fp&,
const Fp&);
3345 typedef void (*opFpx3)(
Fp&,
const Fp&,
const Fp&);
3389 make_FpDbl_addNC(1);
3401 make_FpDbl_subNC(1);
3412 typedef void (*opFp2x2)(
Fp2&,
const Fp2&);
3413 typedef void (*opFp2x3)(
Fp2&,
const Fp2&,
const Fp2&);
3441 make_Fp2_mul_Fp_0();
3455 make_FpDbl_addNC(2);
3467 make_FpDbl_subNC(2);
3471 make_Fp2Dbl_mulOpt(1);
3475 make_Fp2Dbl_mulOpt(2);
3479 make_Fp2Dbl_square();
3487 make_Fp2Dbl_mul_xi();
3490 typedef void (*opFp6x3)(
Fp6&,
const Fp6&,
const Fp6&);
3502 make_pointDblLineEval(
false);
3505 make_pointDblLineEval(
true);
3517 make_Compress_square_n();
3529 make_Fp12Dbl_mul_Fp2_024();
3543 void *p_Fp2_2z_add_3x;
3546 void *p_FpDbl_addNC;
3547 void *p_FpDbl_subNC;
3549 void *p_Fp2Dbl_mulOpt1;
3550 void *p_Fp2Dbl_mulOpt2;
3551 void *p_Fp2Dbl_square;
3552 void *p_Fp2Dbl_mul_xi;
3579 assert((
p[0] & 0x1) == 1);
3580 halfTbl_[0].clear();
3584 assert((
p[0] & 0x3) == 3);
3585 quarterTbl_[0].clear();
3595 puts(
"DEBUG_COUNT mode on!!!");
3597#ifdef BN_SUPPORT_SNARK
3598 const bool scipr =
true;
3600 const bool scipr =
false;
3602 if (scipr != definedBN_SUPPORT_SNARK) {
3603 fprintf(stderr,
"use -DBN_SUPPORT_SNARK for all sources\n");
3606 static bool init =
false;
3610 mie::local::errExit(
"not support p for Fp::setModulo");
3617 ZN::setModulo(
Vuint(1) << (
sizeof(
Unit) * 8));
3623 p_add1_div4_ = (
p + 1) / 4;
3627 montgomeryR_ = (
Vuint(1) << 256) %
p;
3640 const int PageSize = 4096;
3641 const size_t codeSize = PageSize * 9;
3642 const size_t dataSize = PageSize * 1;
3644 static std::vector<Xbyak::uint8>
buf;
3645 buf.resize(codeSize + dataSize + PageSize);
3651 if ((
size_t)codeAddr & 0xffffffff00000000ULL || (
size_t)
s_data & 0xffffffff00000000ULL) {
3661 for (
size_t i = 0; i <
pTblSize; i++) {
3670 for (
size_t h = 1; h <
pNtblSize; ++h) {
3671 Fp::Dbl::pNTbl_[h].setDirect(pN >> h);
3676 static PairingCode code(codeSize, codeAddr);
3677 code.init(p_, mode, useMulx);
3680 for (
int i = 0; i < 512; i++) {
3681 invTbl_[511 - i] = t;
3686 }
catch (std::exception& e) {
3687 fprintf(stderr,
"setModulo ERR:%s\n", e.what());
static uint8 * getAlignedAddress(uint8 *addr, size_t alignedSize=16)
static bool protect(const void *addr, size_t size, bool canExec)
const uint8 * getCurr() const
void shr(const Operand &op, const Reg8 &_cl)
void shl(const Operand &op, const Reg8 &_cl)
void call(const Operand &op)
void and_(const Operand &op, uint32 imm)
void xor_(const Operand &op, uint32 imm)
void paddd(const Mmx &mmx, const Operand &op)
void shrd(const Operand &op, const Reg ®, const Reg8 &_cl)
void align(size_t x=16, bool useMultiByteNop=true)
void jmp(const Operand &op)
void inc(const Operand &op)
void mulx(const Reg32e &r1, const Reg32e &r2, const Operand &op)
void test(const Operand &op, const Reg ®)
void add(const Operand &op, uint32 imm)
void sub(const Operand &op, uint32 imm)
void movq(const Address &addr, const Mmx &mmx)
void cmovc(const Reg ®, const Operand &op)
void shld(const Operand &op, const Reg ®, const Reg8 &_cl)
void adc(const Operand &op, uint32 imm)
void jz(const Label &label, LabelType type=T_AUTO)
void jnc(const Label &label, LabelType type=T_AUTO)
void pxor(const Mmx &mmx, const Operand &op)
void jc(const Label &label, LabelType type=T_AUTO)
void mov(const Operand ®1, const Operand ®2)
void or_(const Operand &op, uint32 imm)
void pop(const Operand &op)
void mul(const Operand &op)
void lea(const Reg ®, const Address &addr)
void jnz(const Label &label, LabelType type=T_AUTO)
void L(const std::string &label)
void push(const Operand &op)
void sbb(const Operand &op, uint32 imm)
static void getCpuid(unsigned int eaxIn, unsigned int data[4])
bool has(Type type) const
static void(* add)(Fp &out, const Fp &x, const Fp &y)
static void(* addNC)(Fp &out, const Fp &x, const Fp &y)
static void(* shr2)(Fp &out, const Fp &x)
static void(* mul)(Fp &out, const Fp &x, const Fp &y)
static int(* preInv)(Fp &r, const Fp &x)
static mie::Fp * halfTbl_
static const mie::Vuint & getModulo()
static const Fp & getDirectP(int n)
static void(* neg)(Fp &out, const Fp &x)
static void(* sub)(Fp &out, const Fp &x, const Fp &y)
static void(* subNC)(Fp &out, const Fp &x, const Fp &y)
static void(* shr1)(Fp &out, const Fp &x)
static MIE_FORCE_INLINE void setDirect(Fp &out, const T &in)
static void setModulo(const mie::Vuint &p, int mode, bool useMulx=true, bool definedBN_SUPPORT_SNARK=false)
static void setModulo(const V &m)
LOGGING_API void printf(Category category, const char *format,...)
uint32_t next(octet_iterator &it, octet_iterator end)
const GenericPointer< typename T::ValueType > T2 T::AllocatorType & a
unsigned __int64 uint64_t
static void(* square_n)(CompressT &z, int n)
static void(* mul_Fp2_024)(Fp12T &z, const Fp6 &x)
static void(* mul)(Fp12T &z, const Fp12T &x, const Fp12T &y)
static void(* square)(Fp12T &z)
static void(* mod)(Fp2T &z, const Dbl &x)
static void(* square)(Dbl &z, const Fp2T &x)
static void(* mulOpt1)(Dbl &z, const Fp2T &x, const Fp2T &y)
static void(* mulOpt2)(Dbl &z, const Fp2T &x, const Fp2T &y)
static void(* square)(Fp2T &z, const Fp2T &x)
static void(* addNC)(Fp2T &z, const Fp2T &x, const Fp2T &y)
static void(* divBy2)(Fp2T &z, const Fp2T &x)
static void(* mul_Fp_0)(Fp2T &z, const Fp2T &x, const Fp &b)
static void(* sub)(Fp2T &z, const Fp2T &x, const Fp2T &y)
static void(* mul_xi)(Fp2T &z, const Fp2T &x)
static void(* add)(Fp2T &z, const Fp2T &x, const Fp2T &y)
static void(* mul)(Fp2T &z, const Fp2T &x, const Fp2T &y)
static void(* mul)(Dbl &, const Fp6T &x, const Fp6T &y)
static void(* pointDblLineEvalWithoutP)(Fp6T &l, Fp2 *R)
static void(* pointDblLineEval)(Fp6T &l, Fp2 *R, const Fp *P)
static void(* sub)(Fp6T &z, const Fp6T &x, const Fp6T &y)
static void(* add)(Fp6T &z, const Fp6T &x, const Fp6T &y)
static void(* mul)(Fp6T &z, const Fp6T &x, const Fp6T &y)
static MIE_FORCE_INLINE void setDirect(Dbl &out, const mie::Vuint &in)
const Unit * const_ptr() const
static void(* mod)(Fp &z, const Dbl &x)
void bin_op(Dbl &z, const Dbl &x, const Dbl &y)
static void(* mul)(Dbl &z, const Fp &x, const Fp &y)
void uni_op(Dbl &z, const Dbl &x)
Xbyak ; JIT assembler for x86(IA32)/x64 by C++.
c_gkp_out sizeof(template))
mie::ZmZ< mie::Vuint, Fp > Fp_emu