Wire Sysio Wire Sysion 1.0.0
Loading...
Searching...
No Matches
make_512.cpp
Go to the documentation of this file.
1#include <stdio.h>
2#include "xbyak/xbyak.h"
3#include <stdlib.h>
4#include <string.h>
5#include "cybozu/inttype.hpp"
6#define NUM_OF_ARRAY(x) (sizeof(x) / sizeof(x[0]))
7
8using namespace Xbyak;
9
10const int bitEnd = 64;
11
12const uint64 YMM_SAE = 1ULL << 0;
13const uint64 _XMM = 1ULL << 1;
14const uint64 _MEM = 1ULL << 2;
15const uint64 _REG32 = 1ULL << 3;
16const uint64 EAX = 1ULL << 4;
17const uint64 IMM32 = 1ULL << 5;
18const uint64 IMM8 = 1ULL << 6;
19const uint64 _REG8 = 1ULL << 7;
20const uint64 _REG16 = 1ULL << 8;
21const uint64 XMM_K = 1ULL << 9;
22const uint64 YMM_K = 1ULL << 10;
23const uint64 ZMM_K = 1ULL << 11;
24const uint64 AX = 1ULL << 12;
25const uint64 AL = 1ULL << 13;
26const uint64 IMM_1 = 1ULL << 14;
27const uint64 MEM8 = 1ULL << 15;
28const uint64 MEM16 = 1ULL << 16;
29const uint64 MEM32 = 1ULL << 17;
30const uint64 VM32Z = 1ULL << 19;
31const uint64 K_K = 1ULL << 20;
32const uint64 MEM_ONLY_DISP = 1ULL << 21;
33const uint64 VM32X_K = 1ULL << 23;
34const uint64 _YMM = 1ULL << 24;
35const uint64 VM32X_32 = 1ULL << 39;
36const uint64 VM32X_64 = 1ULL << 40;
37const uint64 VM32Y_32 = 1ULL << 41;
38const uint64 VM32Y_64 = 1ULL << 42;
39const uint64 VM32Z_K = 1ULL << 32;
40#ifdef XBYAK64
41const uint64 _MEMe = 1ULL << 25;
42const uint64 REG32_2 = 1ULL << 26; // r8d, ...
43const uint64 REG16_2 = 1ULL << 27; // r8w, ...
44const uint64 REG8_2 = 1ULL << 28; // r8b, ...
45const uint64 REG8_3 = 1ULL << 29; // spl, ...
46const uint64 _REG64 = 1ULL << 30; // rax, ...
47const uint64 _REG64_2 = 1ULL << 31; // r8, ...
48const uint64 _XMM2 = 1ULL << 33;
49const uint64 _YMM2 = 1ULL << 34;
52#else
53const uint64 _MEMe = 0;
54const uint64 REG32_2 = 0;
55const uint64 REG16_2 = 0;
56const uint64 REG8_2 = 0;
57const uint64 REG8_3 = 0;
58const uint64 _REG64 = 0;
59const uint64 _REG64_2 = 0;
60const uint64 _XMM2 = 0;
61const uint64 _YMM2 = 0;
64#endif
70const uint64 MEM = _MEM | _MEMe;
71const uint64 MEM64 = 1ULL << 35;
72const uint64 YMM_ER = 1ULL << 36;
73const uint64 VM32Y_K = 1ULL << 37;
74const uint64 IMM_2 = 1ULL << 38;
76const uint64 XMM = _XMM | _XMM2;
77const uint64 YMM = _YMM | _YMM2;
78const uint64 K = 1ULL << 43;
79const uint64 _ZMM = 1ULL << 44;
80const uint64 _ZMM2 = 1ULL << 45;
81#ifdef XBYAK64
82const uint64 ZMM = _ZMM | _ZMM2;
83const uint64 _YMM3 = 1ULL << 46;
84#else
85const uint64 ZMM = _ZMM;
86const uint64 _YMM3 = 0;
87#endif
88const uint64 K2 = 1ULL << 47;
89const uint64 ZMM_SAE = 1ULL << 48;
90const uint64 ZMM_ER = 1ULL << 49;
91#ifdef XBYAK64
92const uint64 _XMM3 = 1ULL << 50;
93#endif
94const uint64 XMM_SAE = 1ULL << 51;
95#ifdef XBYAK64
96const uint64 XMM_KZ = 1ULL << 52;
97const uint64 YMM_KZ = 1ULL << 53;
98const uint64 ZMM_KZ = 1ULL << 54;
99#else
100const uint64 XMM_KZ = 0;
101const uint64 YMM_KZ = 0;
102const uint64 ZMM_KZ = 0;
103#endif
104const uint64 MEM_K = 1ULL << 55;
105const uint64 M_1to2 = 1ULL << 56;
106const uint64 M_1to4 = 1ULL << 57;
107const uint64 M_1to8 = 1ULL << 58;
108const uint64 M_1to16 = 1ULL << 59;
109const uint64 XMM_ER = 1ULL << 60;
110const uint64 M_xword = 1ULL << 61;
111const uint64 M_yword = 1ULL << 62;
112const uint64 MY_1to4 = 1ULL << 18;
113
114const uint64 NOPARA = 1ULL << (bitEnd - 1);
115
116class Test {
117 Test(const Test&);
118 void operator=(const Test&);
119 const bool isXbyak_;
120 int funcNum_;
121 // check all op1, op2, op3
122 void put(const std::string& nm, uint64 op1 = NOPARA, uint64 op2 = NOPARA, uint64 op3 = NOPARA, uint64 op4 = NOPARA) const
123 {
124 for (int i = 0; i < bitEnd; i++) {
125 if ((op1 & (1ULL << i)) == 0) continue;
126 for (int j = 0; j < bitEnd; j++) {
127 if ((op2 & (1ULL << j)) == 0) continue;
128 for (int k = 0; k < bitEnd; k++) {
129 if ((op3 & (1ULL << k)) == 0) continue;
130 for (int s = 0; s < bitEnd; s++) {
131 if ((op4 & (1ULL << s)) == 0) continue;
132 printf("%s ", nm.c_str());
133 if (isXbyak_) printf("(");
134 if (!(op1 & NOPARA)) printf("%s", get(1ULL << i));
135 if (!(op2 & NOPARA)) printf(", %s", get(1ULL << j));
136 if (!(op3 & NOPARA)) printf(", %s", get(1ULL << k));
137 if (!(op4 & NOPARA)) printf(", %s", get(1ULL << s));
138 if (isXbyak_) printf("); dump();");
139 printf("\n");
140 }
141 }
142 }
143 }
144 }
145 void put(const char *nm, uint64 op, const char *xbyak, const char *nasm) const
146 {
147 for (int i = 0; i < bitEnd; i++) {
148 if ((op & (1ULL << i)) == 0) continue;
149 printf("%s ", nm);
150 if (isXbyak_) printf("(");
151 if (!(op & NOPARA)) printf("%s", get(1ULL << i));
152 printf(", %s", isXbyak_ ? xbyak : nasm);
153 if (isXbyak_) printf("); dump();");
154 printf("\n");
155 }
156 }
157 void put(const char *nm, const char *xbyak, const char *nasm = 0, uint64 op = NOPARA) const
158 {
159 if (nasm == 0) nasm = xbyak;
160 for (int i = 0; i < bitEnd; i++) {
161 if ((op & (1ULL << i)) == 0) continue;
162 printf("%s ", nm);
163 if (isXbyak_) printf("(");
164 printf("%s ", isXbyak_ ? xbyak : nasm);
165 if (!(op & NOPARA)) printf(", %s", get(1ULL << i));
166 if (isXbyak_) printf("); dump();");
167 printf("\n");
168 }
169 }
170 const char *get(uint64 type) const
171 {
172 int idx = (rand() / 31) & 7;
173 switch (type) {
174 case _XMM:
175 {
176 static const char tbl[][6] = {
177 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
178 };
179 return tbl[idx];
180 }
181 case _YMM:
182 {
183 static const char tbl[][6] = {
184 "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7"
185 };
186 return tbl[idx];
187 }
188 case _ZMM:
189 {
190 static const char tbl[][6] = {
191 "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7"
192 };
193 return tbl[idx];
194 }
195#ifdef XBYAK64
196 case _XMM2:
197 {
198 static const char tbl[][6] = {
199 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
200 };
201 return tbl[idx];
202 }
203 case _XMM3:
204 {
205 static const char tbl[][6] = {
206 "xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23"
207 };
208 return tbl[idx];
209 }
210 case _YMM2:
211 {
212 static const char tbl[][6] = {
213 "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
214 };
215 return tbl[idx];
216 }
217 case _YMM3:
218 {
219 static const char tbl[][6] = {
220 "ymm16", "ymm17", "ymm18", "ymm19", "ymm20", "ymm21", "ymm22", "ymm23",
221 };
222 return tbl[idx];
223 }
224 case _ZMM2:
225 {
226 static const char tbl[][6] = {
227 "zmm8", "zmm9", "zmm10", "zmm11", "zmm28", "zmm29", "zmm30", "zmm31",
228 };
229 return tbl[idx];
230 }
231#endif
232 case _MEM:
233 return isXbyak_ ? "ptr[eax+ecx+64]" : "[eax+ecx+64]"; // QQQ
234// return isXbyak_ ? "ptr[eax+ecx+6]" : "[eax+ecx+6]";
235 case _MEMe:
236 {
237 static int ccc = 1;
238#ifdef USE_YASM
239 ccc++;
240#endif
241 if (ccc & 1) {
242 return isXbyak_ ? "ptr[rdx+r15+0x12]" : "[rdx+r15+0x12]";
243 } else {
244 return isXbyak_ ? "ptr[rip - 0x13456+1-3]" : "[rip - 0x13456+1-3]";
245 }
246 }
247 case MEM8:
248 return "byte [eax+edx]";
249 case MEM16:
250 return "word [esi]";
251 case MEM32:
252 return "dword [eax+64]";
253 case MEM64:
254 return "qword [rax+64]";
255 case MEM_ONLY_DISP:
256 return isXbyak_ ? "ptr[(void*)0x123]" : "[0x123]";
257 case _REG16: // not ax
258 {
259 static const char Reg16Tbl[][4] = {
260 "ax", "cx", "dx", "bx", "sp", "bp", "si", "di"
261 };
262 return Reg16Tbl[(idx % 7) + 1];
263 }
264 case _REG8: // not al
265 {
266 static const char Reg8Tbl[][4] = {
267#ifdef XBYAK64 // QQQ
268 "al", "cl", "dl", "bl", "al", "cl", "dl", "bl"
269#else
270 "al", "cl", "dl", "bl", "ah", "ch", "dh", "bh"
271#endif
272 };
273 return Reg8Tbl[(idx % 7) + 1];
274 }
275 case _REG32: // not eax
276 {
277 static const char Reg32Tbl[][4] = {
278 "eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi"
279 };
280 return Reg32Tbl[(idx % 7) + 1];
281 }
282#ifdef XBYAK64
283 case _REG64: // not rax
284 {
285 static const char Reg64Tbl[][4] = {
286 "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi"
287 };
288 return Reg64Tbl[(idx % 7) + 1];
289 }
290 case _REG64_2:
291 {
292 static const char Reg64_2Tbl[][4] = {
293 "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
294 };
295 return Reg64_2Tbl[idx];
296 }
297 case REG32_2:
298 {
299 static const char Reg32eTbl[][5] = {
300 "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d"
301 };
302 return Reg32eTbl[idx];
303 }
304 case REG16_2:
305 {
306 static const char Reg16eTbl[][5] = {
307 "r8w", "r9w", "r10w", "r11w", "r12w", "r13w", "r14w", "r15w"
308 };
309 return Reg16eTbl[idx];
310 }
311 case REG8_2:
312 {
313 static const char Reg8_2Tbl[][5] = {
314 "r8b", "r9b", "r10b", "r11b", "r12b", "r13b", "r14b", "r15b"
315 };
316 return Reg8_2Tbl[idx];
317 }
318 case REG8_3:
319 {
320 static const char Reg8_3Tbl[][5] = {
321 "spl", "bpl", "sil", "dil", "spl", "bpl", "sil", "dil"
322 };
323 return Reg8_3Tbl[idx];
324 }
325#endif
326 case EAX:
327 return "eax";
328 case AX:
329 return "ax";
330 case AL:
331 return "al";
332 case K_K:
333 return isXbyak_ ? "k5 | k3" : "k5{k3}";
334 case IMM32:
335 return isXbyak_ ? "12345678" : "dword 12345678";
336 case IMM8:
337 return isXbyak_ ? "4" : "byte 4";
338 case IMM_1:
339 return "4";
340 case IMM_2:
341 return isXbyak_ ? "0xda" : "0xda";
342 case VM32X_32:
343 return isXbyak_ ? "ptr [ebp+64+xmm1*8]" : "[ebp+64+xmm1*8]";
344 case VM32X_64:
345 return isXbyak_ ? "ptr [rax+64+xmm13*2]" : "[rax+64+xmm13*2]";
346 case VM32Y_32:
347 return isXbyak_ ? "ptr [ymm4]" : "[ymm4]";
348 case VM32Y_64:
349 return isXbyak_ ? "ptr [64+ymm13*2+r13]" : "[64+ymm13*2+r13]";
350 case VM32X_K:
351 return isXbyak_ ? "ptr [64+xmm13*2+r13] | k6" : "[64+xmm13*2+r13]{k6}";
352 case VM32Y_K:
353 return isXbyak_ ? "ptr [64+ymm13*2+r13] | k6" : "[64+ymm13*2+r13]{k6}";
354 case VM32Z_K:
355 return isXbyak_ ? "ptr [64+zmm13*2+r13] | k6" : "[64+zmm13*2+r13]{k6}";
356 case VM32Z:
357 return isXbyak_ ? "ptr [64+zmm13*2+rcx]" : "[64+zmm13*2+rcx]";
358 case M_1to2: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to2}";
359 case M_1to4: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to4}";
360 case M_1to8: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to8}";
361 case M_1to16: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to16}";
362
363 case M_xword: return isXbyak_ ? "ptr [eax+32]" : "oword [eax+32]";
364 case M_yword: return isXbyak_ ? "yword [eax+32]" : "yword [eax+32]";
365 case MY_1to4: return isXbyak_ ? "yword_b [eax+32]" : "[eax+32]{1to4}";
366 case K:
367 {
368 static const char kTbl[][5] = {
369 "k1", "k2", "k3", "k4", "k5", "k6", "k7",
370 };
371 return kTbl[idx % 7];
372 }
373 case K2:
374 return isXbyak_ ? "k3 | k5" : "k3{k5}";
375#ifdef XBYAK64
376 case XMM_SAE:
377 return isXbyak_ ? "xmm25 | T_sae" : "xmm25, {sae}";
378 case YMM_SAE:
379 return isXbyak_ ? "ymm25 | T_sae" : "ymm25, {sae}";
380 case ZMM_SAE:
381 return isXbyak_ ? "zmm25 | T_sae" : "zmm25, {sae}";
382 case XMM_ER:
383 return isXbyak_ ? "xmm4 | T_rd_sae" : "xmm4, {rd-sae}";
384 case YMM_ER:
385 return isXbyak_ ? "ymm20 | T_rd_sae" : "ymm20, {rd-sae}";
386 case ZMM_ER:
387 return isXbyak_ ? "zmm20 | T_rd_sae" : "zmm20, {rd-sae}";
388 case XMM_KZ:
389 return isXbyak_ ? "xmm5 | k5" : "xmm5{k5}";
390 case YMM_KZ:
391 return isXbyak_ ? "ymm2 |k3|T_z" : "ymm2{k3}{z}";
392 case ZMM_KZ:
393 return isXbyak_ ? "zmm7|k1" : "zmm7{k1}";
394 case MEM_K:
395 return isXbyak_ ? "ptr [rax] | k1" : "[rax]{k1}";
396#else
397 case XMM_SAE:
398 return isXbyak_ ? "xmm5 | T_sae" : "xmm5, {sae}";
399 case YMM_SAE:
400 return isXbyak_ ? "ymm5 | T_sae" : "ymm5, {sae}";
401 case ZMM_SAE:
402 return isXbyak_ ? "zmm5 | T_sae" : "zmm5, {sae}";
403 case XMM_ER:
404 return isXbyak_ ? "xmm30 | T_rd_sae" : "xmm30, {rd-sae}";
405 case YMM_ER:
406 return isXbyak_ ? "ymm2 | T_rd_sae" : "ymm2, {rd-sae}";
407 case ZMM_ER:
408 return isXbyak_ ? "zmm2 | T_rd_sae" : "zmm2, {rd-sae}";
409 case MEM_K:
410 return isXbyak_ ? "ptr [eax] | k1" : "[eax]{k1}";
411#endif
412 case XMM_K:
413 return isXbyak_ ? "xmm5 | k7" : "xmm5{k7}";
414 case YMM_K:
415 return isXbyak_ ? "ymm5 | k4" : "ymm5{k4}";
416 case ZMM_K:
417 return isXbyak_ ? "zmm5 | k3" : "zmm5{k3}";
418 }
419 return 0;
420 }
421public:
422 Test(bool isXbyak)
423 : isXbyak_(isXbyak)
424 , funcNum_(1)
425 {
426 if (!isXbyak_) return;
427 printf("%s",
428 " void gen0()\n"
429 " {\n");
430 }
    /*
        gcc and vc give up compiling this source,
        so the generated code is split into several functions.
    */
436 {
437 if (!isXbyak_) return;
438 printf(
439 " }\n"
440 " void gen%d()\n"
441 " {\n", funcNum_++);
442 }
444 {
445 if (!isXbyak_) return;
446 printf("%s",
447 " }\n"
448 " void gen()\n"
449 " {\n");
450 for (int i = 0; i < funcNum_; i++) {
451 printf(
452 " gen%d();\n", i);
453 }
454 printf(
455 " }\n");
456 }
457 void put()
458 {
459 putAVX512();
460 }
462 {
463 {
464 const char *tbl[] = {
465 "kadd",
466 "kand",
467 "kandn",
468 "kor",
469 "kxnor",
470 "kxor",
471 };
472 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
473 std::string name = tbl[i];
474 put(name + "b", K, K, K);
475 put(name + "w", K, K, K);
476 put(name + "q", K, K, K);
477 put(name + "d", K, K, K);
478 }
479 put("kunpckbw", K, K, K);
480 put("kunpckwd", K, K, K);
481 put("kunpckdq", K, K, K);
482 }
483 {
484 const char *tbl[] = {
485 "knot",
486 "kortest",
487 "ktest",
488 };
489 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
490 std::string name = tbl[i];
491 put(name + "b", K, K);
492 put(name + "w", K, K);
493 put(name + "q", K, K);
494 put(name + "d", K, K);
495 }
496 }
497 {
498 const char *tbl[] = {
499 "kshiftl",
500 "kshiftr",
501 };
502 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
503 std::string name = tbl[i];
504 put(name + "b", K, K, IMM8);
505 put(name + "w", K, K, IMM8);
506 put(name + "q", K, K, IMM8);
507 put(name + "d", K, K, IMM8);
508 }
509 }
510 put("kmovw", K, K | MEM | REG32);
511 put("kmovq", K, K | MEM);
512 put("kmovb", K, K | MEM | REG32);
513 put("kmovd", K, K | MEM | REG32);
514
515 put("kmovw", MEM | REG32, K);
516 put("kmovq", MEM, K);
517 put("kmovb", MEM | REG32, K);
518 put("kmovd", MEM | REG32, K);
519#ifdef XBYAK64
520 put("kmovq", K, REG64);
521 put("kmovq", REG64, K);
522#endif
523 }
524 void put_vaddpd(const char *r1, const char *r2, const char *r3, int kIdx = 0, bool z = false, int sae = 0)
525 {
526 std::string modifier;
527 char pk[16] = "";
528 const char *pz = "";
529 const char *saeTblXbyak[] = { "", "|T_rn_sae", "|T_rd_sae", "|T_ru_sae", "|T_rz_sae" };
530 const char *saeTblNASM[] = { "", ",{rn-sae}", ",{rd-sae}", ",{ru-sae}", ",{rz-sae}" };
531 if (isXbyak_) {
532 if (kIdx) CYBOZU_SNPRINTF(pk, sizeof(pk), "|k%d", kIdx);
533 if (z) pz = "|T_z";
534 printf("vaddpd(%s%s%s, %s, %s%s); dump();\n", r1, pk, pz, r2, r3, saeTblXbyak[sae]);
535 } else {
536 if (kIdx) CYBOZU_SNPRINTF(pk, sizeof(pk), "{k%d}", kIdx);
537 if (z) pz = "{z}";
538 printf("vaddpd %s%s%s, %s, %s%s\n", r1, pk, pz, r2, r3, saeTblNASM[sae]);
539 }
540 }
541 void putCombi()
542 {
543 const char *xTbl[] = {
544 "xmm2",
545#ifdef XBYAK64
546 "xmm8", "xmm31"
547#else
548 "xmm5", "xmm6"
549#endif
550 };
551 const char *yTbl[] = {
552 "ymm0",
553#ifdef XBYAK64
554 "ymm15", "ymm31"
555#else
556 "ymm4", "ymm2"
557#endif
558 };
559 const char *zTbl[] = {
560 "zmm1",
561#ifdef XBYAK64
562 "zmm9", "zmm30"
563#else
564 "zmm3", "zmm7"
565#endif
566 };
567 const size_t N = NUM_OF_ARRAY(zTbl);
568 for (size_t i = 0; i < N; i++) {
569 for (size_t j = 0; j < N; j++) {
570 separateFunc();
571 for (size_t k = 0; k < N; k++) {
572#ifdef XBYAK64
573 for (int kIdx = 0; kIdx < 8; kIdx++) {
574 for (int z = 0; z < 2; z++) {
575 put_vaddpd(xTbl[i], xTbl[j], xTbl[k], kIdx, z == 1);
576 put_vaddpd(yTbl[i], yTbl[j], yTbl[k], kIdx, z == 1);
577 for (int sae = 0; sae < 5; sae++) {
578 put_vaddpd(zTbl[i], zTbl[j], zTbl[k], kIdx, z == 1, sae);
579 }
580 }
581 }
582#else
583 put_vaddpd(xTbl[i], xTbl[j], xTbl[k]);
584 put_vaddpd(yTbl[i], yTbl[j], yTbl[k]);
585 for (int sae = 0; sae < 5; sae++) {
586 put_vaddpd(zTbl[i], zTbl[j], zTbl[k], sae);
587 }
588#endif
589 }
590 }
591 }
592 put("vaddpd", XMM, XMM, _MEM);
593 put("vaddpd", YMM, YMM, _MEM);
594 put("vaddpd", ZMM, ZMM, _MEM);
595 }
596 void putCmpK()
597 {
598 {
599 const struct Tbl {
600 const char *name;
601 bool supportYMM;
602 } tbl[] = {
603 { "vcmppd", true },
604 { "vcmpps", true },
605 { "vcmpsd", false },
606 { "vcmpss", false },
607 };
608 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
609 const Tbl *p = &tbl[i];
610 put(p->name, K, _XMM, _XMM | MEM, IMM8);
611 if (!p->supportYMM) continue;
612 put(p->name, K, _YMM, _YMM | MEM, IMM8);
613 put(p->name, K, _ZMM, _ZMM | MEM, IMM8);
614 }
615 }
616 put("vcmppd", K2, ZMM, ZMM_SAE, IMM);
617#ifdef XBYAK64
618 {
619 const struct Tbl {
620 const char *name;
621 } tbl[] = {
622 { "vcomisd" },
623 { "vcomiss" },
624 { "vucomisd" },
625 { "vucomiss" },
626 };
627 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
628 const Tbl *p = &tbl[i];
629 put(p->name, XMM | _XMM3, XMM_SAE | XMM | MEM);
630 }
631 }
632 put("vcomiss", _XMM3, XMM | MEM);
633 put("vcomiss", XMM, XMM_SAE);
634#endif
635 }
636 void putBroadcastSub(int idx, int disp)
637 {
638#ifdef XBYAK64
639 const char *a = "rax";
640#else
641 const char *a = "eax";
642#endif
643 if (isXbyak_) {
644 printf("vaddpd(zmm%d, zmm1, ptr_b[%s+%d]);dump();\n", idx, a, disp);
645 printf("vaddpd(ymm%d, ymm1, ptr_b[%s+%d]);dump();\n", idx, a, disp);
646 printf("vaddpd(xmm%d, xmm1, ptr_b[%s+%d]);dump();\n", idx, a, disp);
647 } else {
648 printf("vaddpd zmm%d, zmm1, [%s+%d]{1to8}\n", idx, a, disp);
649 printf("vaddpd ymm%d, ymm1, [%s+%d]{1to4}\n", idx, a, disp);
650 printf("vaddpd xmm%d, xmm1, [%s+%d]{1to2}\n", idx, a, disp);
651 }
652 }
654 {
655 for (int i = 0; i < 9; i++) {
656 putBroadcastSub(0, i);
657#ifdef XBYAK64
658 putBroadcastSub(10, i);
659 putBroadcastSub(20, i);
660#endif
661 }
662 put("vpbroadcastb", XMM_KZ | ZMM_KZ, REG8 | _MEM);
663 put("vpbroadcastw", XMM_KZ | ZMM_KZ, REG16 | _MEM);
664 put("vpbroadcastd", XMM_KZ | ZMM_KZ, REG32 | _MEM);
665#ifdef XBYAK64
666 put("vpbroadcastq", XMM_KZ | ZMM_KZ, REG64 | _MEM);
667#endif
668 {
669 const char *tbl[] = {
670 "vpbroadcastb",
671 "vpbroadcastw",
672 "vpbroadcastd",
673 "vpbroadcastq",
674 };
675 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
676 put(tbl[i], XMM_KZ | ZMM_KZ, _XMM | _MEM);
677 }
678 }
679 put("vbroadcasti32x2", XMM_KZ | YMM_KZ | ZMM_KZ, _XMM | _MEM);
680 put("vbroadcasti32x4", YMM_KZ | ZMM_KZ, _MEM);
681 put("vbroadcasti64x2", YMM_KZ | ZMM_KZ, _MEM);
682 put("vbroadcasti32x8", ZMM_KZ, _MEM);
683 put("vbroadcasti64x4", ZMM_KZ, _MEM);
684 }
685 void putMisc1()
686 {
687 put("vmaskmovps", XMM, XMM, MEM);
688 put("vmaskmovps", YMM, YMM, MEM);
689
690 put("vmaskmovpd", YMM, YMM, MEM);
691 put("vmaskmovpd", XMM, XMM, MEM);
692
693 put("vmaskmovps", MEM, XMM, XMM);
694 put("vmaskmovpd", MEM, XMM, XMM);
695
696 put("vbroadcastf128", YMM, MEM);
697 put("vbroadcasti128", YMM, MEM);
698 put("vbroadcastsd", YMM|_YMM3, XMM|MEM);
699 put("vbroadcastsd", ZMM, XMM|MEM);
700 {
701 const char *tbl[] = {
702 "vbroadcastss",
703 "vpbroadcastb",
704 "vpbroadcastw",
705 "vpbroadcastd",
706 "vpbroadcastq",
707 };
708 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
709 put(tbl[i], XMM | YMM | ZMM, XMM|MEM);
710 }
711 }
712
713 put("vinsertf128", YMM, YMM, XMM | MEM, IMM8);
714 put("vinserti128", YMM, YMM, XMM | MEM, IMM8);
715 put("vperm2f128", YMM, YMM, YMM | MEM, IMM8);
716 put("vperm2i128", YMM, YMM, YMM | MEM, IMM8);
717
718 {
719 const char *tbl[] = {
720 "vpmaskmovd", "vpmaskmovq"
721 };
722 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
723 const char *name = tbl[i];
724 put(name, XMM, XMM, MEM);
725 put(name, YMM, YMM, MEM);
726 put(name, MEM, XMM, XMM);
727 put(name, MEM, YMM, YMM);
728 }
729 }
730 {
731 const char *tbl[] = {
732 "vpermd", "vpermps",
733 };
734 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
735 const char *name = tbl[i];
736 put(name, YMM, YMM, YMM | MEM);
737 }
738 }
739 {
740 const char *tbl[] = {
741 "vpermq", "vpermpd",
742 };
743 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
744 const char *name = tbl[i];
745 put(name, YMM, YMM | MEM, IMM8);
746 }
747 }
748 put("vpextrw", REG32e | MEM, XMM, IMM); // nasm is ok, yasm generate redundant code
749 }
751 {
752 const char *tbl[] = {
753 "vmovapd",
754 "vmovaps",
755 "vmovupd",
756 "vmovups",
757 };
758 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
759 const char *name = tbl[i];
760 put(name, MEM, ZMM);
761 put(name, ZMM, MEM);
762#ifdef XBYAK64
763 put(name, MEM, _XMM3);
764 put(name, _XMM3, MEM);
765#endif
766 }
767 }
/*
    Emit move-family instructions that use the xmm16+ / ymm16+ / zmm kinds,
    plus valignd/valignq. The whole body is compiled only with XBYAK64;
    otherwise the function emits nothing.
*/
void put_vmov()
{
#ifdef XBYAK64
    put("vmovd", _XMM3, MEM|REG32);
    put("vmovd", MEM|REG32, _XMM3);
    put("vmovq", _XMM3, MEM|REG64|XMM);
    put("vmovq", MEM|REG64|XMM, _XMM3);
    put("vmovhlps", _XMM3, _XMM3, _XMM3);
    put("vmovlhps", _XMM3, _XMM3, _XMM3);

    // non-temporal load, then non-temporal stores
    put("vmovntdqa", _XMM3|_YMM3|ZMM, MEM);
    put("vmovntdq", MEM, _XMM3 | _YMM3 | ZMM);
    put("vmovntpd", MEM, _XMM3 | _YMM3 | ZMM);
    put("vmovntps", MEM, _XMM3 | _YMM3 | ZMM);

    // scalar moves: register form, load form, masked store form
    put("vmovsd", XMM_KZ, _XMM3, _XMM3);
    put("vmovsd", XMM_KZ, MEM);
    put("vmovsd", MEM_K, XMM);
    put("vmovss", XMM_KZ, _XMM3, _XMM3);
    put("vmovss", XMM_KZ, MEM);
    put("vmovss", MEM_K, XMM);

    put("vmovshdup", _ZMM, _ZMM);
    put("vmovsldup", _ZMM, _ZMM);

    const char *const alignTbl[] = {
        "valignd",
        "valignq",
    };
    for (size_t n = 0; n < NUM_OF_ARRAY(alignTbl); ++n) {
        const char *const mnemonic = alignTbl[n];
        put(mnemonic, XMM_KZ, _XMM, _XMM | MEM, IMM);
        put(mnemonic, _ZMM, _ZMM, _ZMM | _MEM, IMM);
    }

    const char halfMovTbl[][16] = {
        "vmovhpd",
        "vmovhps",
        "vmovlpd",
        "vmovlps",
    };
    for (size_t n = 0; n < NUM_OF_ARRAY(halfMovTbl); ++n) {
        put(halfMovTbl[n], _XMM3, _XMM3, MEM);
        put(halfMovTbl[n], MEM, _XMM3);
    }
#endif
}
820 {
821 const struct Tbl {
822 const char *name;
823 bool M_X;
824 } tbl[] = {
825 { "vmovddup", false },
826 { "vmovdqa32", true },
827 { "vmovdqa64", true },
828 { "vmovdqu8", true },
829 { "vmovdqu16", true },
830 { "vmovdqu32", true },
831 { "vmovdqu64", true },
832 { "vpabsb", false },
833 { "vpabsw", false },
834 { "vpabsd", false },
835 { "vpabsq", false },
836 };
837 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
838 const Tbl& p = tbl[i];
839 put(p.name, _XMM|XMM_KZ, _XMM|MEM);
840 put(p.name, _YMM|YMM_KZ, _YMM|MEM);
841 put(p.name, _ZMM|ZMM_KZ, _ZMM|MEM);
842 if (!p.M_X) continue;
843 put(p.name, MEM|MEM_K, _XMM);
844 put(p.name, MEM|MEM_K, _YMM);
845 put(p.name, MEM|MEM_K, _ZMM);
846 }
847 put("vsqrtpd", XMM_KZ, M_1to2 | _MEM);
848 put("vsqrtpd", YMM_KZ, M_1to4 | _MEM);
849 put("vsqrtpd", ZMM_KZ, M_1to8 | _MEM);
850 put("vsqrtpd", ZMM_KZ, ZMM_ER);
851
852 put("vsqrtps", XMM_KZ, M_1to4 | _MEM);
853 put("vsqrtps", YMM_KZ, M_1to8 | _MEM);
854 put("vsqrtps", ZMM_KZ, M_1to16 | _MEM);
855 put("vsqrtps", ZMM_KZ, ZMM_ER);
856
857 put("vpabsd", ZMM_KZ, M_1to16 | _MEM);
858 put("vpabsq", ZMM_KZ, M_1to8 | _MEM);
859
860 put("vbroadcastf32x2", YMM_KZ | ZMM_KZ, _XMM | _MEM);
861 put("vbroadcastf32x4", YMM_KZ | ZMM_KZ, _MEM);
862
863 put("vbroadcastf64x2", YMM_KZ | ZMM_KZ, _MEM);
864 put("vbroadcastf64x4", ZMM_KZ, _MEM);
865 put("vbroadcastf32x8", ZMM_KZ, _MEM);
866 }
868 {
869 const struct Tbl {
870 const char *name;
871 uint64_t mem;
872 } tbl[] = {
873 { "vsqrtsd", MEM },
874 { "vsqrtss", MEM },
875 { "vunpckhpd", M_1to2 },
876 { "vunpckhps", M_1to4 },
877 { "vunpcklpd", M_1to2 },
878 { "vunpcklps", M_1to4 },
879 };
880 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
881 const Tbl& p = tbl[i];
882 put(p.name, XMM_KZ, _XMM, _XMM|p.mem);
883 }
884 }
886 {
887#ifdef XBYAK64
888 const struct Tbl {
889 const char *name;
890 uint64_t x1;
891 uint64_t x2;
892 uint64_t xm;
893 } tbl[] = {
894 { "vpacksswb", XMM_KZ, _XMM, _XMM | _MEM },
895 { "vpacksswb", YMM_KZ, _YMM, _YMM | _MEM },
896 { "vpacksswb", ZMM_KZ, _ZMM, _ZMM | _MEM },
897
898 { "vpackssdw", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
899 { "vpackssdw", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM },
900 { "vpackssdw", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },
901
902 { "vpackusdw", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
903 { "vpackusdw", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM },
904 { "vpackusdw", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },
905
906 { "vpackuswb", XMM_KZ, _XMM, _XMM | _MEM },
907 { "vpackuswb", YMM_KZ, _YMM, _YMM | _MEM },
908 { "vpackuswb", ZMM_KZ, _ZMM, _ZMM | _MEM },
909
910 { "vpaddb", XMM_KZ, _XMM, _XMM | _MEM },
911 { "vpaddw", XMM_KZ, _XMM, _XMM | _MEM },
912 { "vpaddd", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
913 { "vpaddq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
914
915 { "vpaddsb", XMM_KZ, _XMM, _XMM | _MEM },
916 { "vpaddsb", ZMM_KZ, _ZMM, _ZMM | _MEM },
917
918 { "vpaddsw", XMM_KZ, _XMM, _XMM | _MEM },
919 { "vpaddsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
920
921 { "vpaddusb", XMM_KZ, _XMM, _XMM | MEM },
922 { "vpaddusb", ZMM_KZ, _ZMM, _ZMM | MEM },
923
924 { "vpaddusw", XMM_KZ, _XMM, _XMM | MEM },
925 { "vpaddusw", ZMM_KZ, _ZMM, _ZMM | MEM },
926
927 { "vpsubb", XMM_KZ, _XMM, _XMM | _MEM },
928 { "vpsubw", XMM_KZ, _XMM, _XMM | _MEM },
929 { "vpsubd", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
930 { "vpsubq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
931
932 { "vpsubsb", XMM_KZ, _XMM, _XMM | _MEM },
933 { "vpsubsb", ZMM_KZ, _ZMM, _ZMM | _MEM },
934
935 { "vpsubsw", XMM_KZ, _XMM, _XMM | _MEM },
936 { "vpsubsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
937
938 { "vpsubusb", XMM_KZ, _XMM, _XMM | MEM },
939 { "vpsubusb", ZMM_KZ, _ZMM, _ZMM | MEM },
940
941 { "vpsubusw", XMM_KZ, _XMM, _XMM | MEM },
942 { "vpsubusw", ZMM_KZ, _ZMM, _ZMM | MEM },
943
944 { "vpandd", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },
945 { "vpandq", ZMM_KZ, _ZMM, _ZMM | M_1to8 | _MEM },
946
947 { "vpandnd", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },
948 { "vpandnq", ZMM_KZ, _ZMM, _ZMM | M_1to8 | _MEM },
949
950 { "vpavgb", ZMM_KZ, _ZMM, _ZMM },
951 { "vpavgw", ZMM_KZ, _ZMM, _ZMM },
952
953 { "vpcmpeqb", K2, _ZMM, _ZMM | _MEM },
954 { "vpcmpeqw", K2, _ZMM, _ZMM | _MEM },
955 { "vpcmpeqd", K2, _ZMM, _ZMM | M_1to16 | _MEM },
956 { "vpcmpeqq", K2, _ZMM, _ZMM | M_1to8 | _MEM },
957
958 { "vpcmpgtb", K2, _ZMM, _ZMM | _MEM },
959 { "vpcmpgtw", K2, _ZMM, _ZMM | _MEM },
960 { "vpcmpgtd", K2, _ZMM, _ZMM | M_1to16 | _MEM },
961 { "vpcmpgtq", K2, _ZMM, _ZMM | M_1to8 | _MEM },
962
963 { "vpmaddubsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
964 { "vpmaddwd", ZMM_KZ, _ZMM, _ZMM | _MEM },
965
966 { "vpmaxsb", ZMM_KZ, _ZMM, _ZMM | _MEM },
967 { "vpmaxsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
968 { "vpmaxsd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 },
969 { "vpmaxsq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 },
970
971 { "vpmaxub", ZMM_KZ, _ZMM, _ZMM | _MEM },
972 { "vpmaxuw", ZMM_KZ, _ZMM, _ZMM | _MEM },
973 { "vpmaxud", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 },
974 { "vpmaxuq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 },
975
976 { "vpminsb", ZMM_KZ, _ZMM, _ZMM | _MEM },
977 { "vpminsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
978 { "vpminsd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 },
979 { "vpminsq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 },
980
981 { "vpminub", ZMM_KZ, _ZMM, _ZMM | _MEM },
982 { "vpminuw", ZMM_KZ, _ZMM, _ZMM | _MEM },
983 { "vpminud", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 },
984 { "vpminuq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 },
985
986 { "vpslldq", _XMM3, _XMM3 | _MEM, IMM8 },
987 { "vpslldq", _YMM3, _YMM3 | _MEM, IMM8 },
988 { "vpslldq", _ZMM, _ZMM | _MEM, IMM8 },
989
990 { "vpsrldq", _XMM3, _XMM3 | _MEM, IMM8 },
991 { "vpsrldq", _YMM3, _YMM3 | _MEM, IMM8 },
992 { "vpsrldq", _ZMM, _ZMM | _MEM, IMM8 },
993
994 { "vpsraw", XMM_KZ, _XMM | _MEM, IMM8 },
995 { "vpsraw", ZMM_KZ, _ZMM | _MEM, IMM8 },
996
997 { "vpsrad", XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 },
998 { "vpsrad", ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 },
999
1000 { "vpsraq", XMM, XMM, IMM8 },
1001 { "vpsraq", XMM_KZ, _XMM | M_1to2 | _MEM, IMM8 },
1002 { "vpsraq", ZMM_KZ, _ZMM | M_1to8 | _MEM, IMM8 },
1003
1004 { "vpsllw", _XMM3, _XMM3 | _MEM, IMM8 },
1005 { "vpslld", _XMM3, _XMM3 | _MEM | M_1to4, IMM8 },
1006 { "vpsllq", _XMM3, _XMM3 | _MEM | M_1to2, IMM8 },
1007
1008 { "vpsrlw", XMM_KZ, _XMM | _MEM, IMM8 },
1009 { "vpsrlw", ZMM_KZ, _ZMM | _MEM, IMM8 },
1010
1011 { "vpsrld", XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 },
1012 { "vpsrld", ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 },
1013
1014 { "vpsrlq", _XMM3, _XMM3 | _MEM | M_1to2, IMM8 },
1015 { "vpsrlq", _ZMM, _ZMM | _MEM | M_1to8, IMM8 },
1016
1017 { "vpsravw", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
1018 { "vpsravw", _ZMM, _ZMM, _MEM },
1019
1020 { "vpsravd", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
1021 { "vpsravd", _ZMM, _ZMM, M_1to16 | _MEM },
1022
1023 { "vpsravq", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
1024 { "vpsravq", _ZMM, _ZMM, M_1to8 | _MEM },
1025
1026 { "vpsllvw", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
1027 { "vpsllvw", _ZMM, _ZMM, _MEM },
1028
1029 { "vpsllvd", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
1030 { "vpsllvd", _ZMM, _ZMM, M_1to16 | _MEM },
1031
1032 { "vpsllvq", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
1033 { "vpsllvq", _ZMM, _ZMM, M_1to8 | _MEM },
1034
1035 { "vpsrlvw", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
1036 { "vpsrlvw", _ZMM, _ZMM, _MEM },
1037
1038 { "vpsrlvd", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
1039 { "vpsrlvd", _ZMM, _ZMM, M_1to16 | _MEM },
1040
1041 { "vpsrlvq", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
1042 { "vpsrlvq", _ZMM, _ZMM, M_1to8 | _MEM },
1043
1044 { "vpshufb", _XMM | XMM_KZ, _XMM, _XMM | _MEM },
1045 { "vpshufb", ZMM_KZ, _ZMM, _MEM },
1046
1047 { "vpshufhw", _XMM | XMM_KZ, _XMM | _MEM, IMM8 },
1048 { "vpshufhw", ZMM_KZ, _MEM, IMM8 },
1049
1050 { "vpshuflw", _XMM | XMM_KZ, _XMM | _MEM, IMM8 },
1051 { "vpshuflw", ZMM_KZ, _MEM, IMM8 },
1052
1053 { "vpshufd", _XMM | XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 },
1054 { "vpshufd", _ZMM | ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 },
1055
1056 { "vpord", _XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
1057 { "vpord", _ZMM | ZMM_KZ, _ZMM, M_1to16 | _MEM },
1058
1059 { "vporq", _XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM },
1060 { "vporq", _ZMM | ZMM_KZ, _ZMM, M_1to8 | _MEM },
1061
1062 { "vpxord", _XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
1063 { "vpxord", _ZMM | ZMM_KZ, _ZMM, M_1to16 | _MEM },
1064
1065 { "vpxorq", _XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM },
1066 { "vpxorq", _ZMM | ZMM_KZ, _ZMM, M_1to8 | _MEM },
1067
1068 { "vpsadbw", _XMM3, _XMM, _XMM | _MEM },
1069 { "vpsadbw", _ZMM, _ZMM, _MEM },
1070
1071 { "vpmuldq", _XMM3, _XMM, _XMM | M_1to2 | _MEM },
1072 { "vpmuldq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
1073
1074 { "vpmulhrsw", _XMM3, _XMM, _XMM | _MEM },
1075 { "vpmulhrsw", ZMM_KZ, _ZMM, _MEM },
1076
1077 { "vpmulhuw", _XMM3, _XMM, _XMM | _MEM },
1078 { "vpmulhuw", ZMM_KZ, _ZMM, _MEM },
1079
1080 { "vpmulhw", _XMM3, _XMM, _XMM | _MEM },
1081 { "vpmulhw", ZMM_KZ, _ZMM, _MEM },
1082
1083 { "vpmullw", _XMM3, _XMM, _XMM | _MEM },
1084 { "vpmullw", ZMM_KZ, _ZMM, _MEM },
1085
1086 { "vpmulld", _XMM3, _XMM, M_1to4 | _MEM },
1087 { "vpmulld", ZMM_KZ, _ZMM, M_1to16 | _MEM },
1088
1089 { "vpmullq", _XMM3, _XMM, M_1to2 | _MEM },
1090 { "vpmullq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
1091
1092 { "vpmuludq", _XMM3, _XMM, M_1to2 | _MEM },
1093 { "vpmuludq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
1094
1095 { "vpunpckhbw", _XMM3, _XMM, _XMM | _MEM },
1096 { "vpunpckhbw", _ZMM, _ZMM, _MEM },
1097
1098 { "vpunpckhwd", _XMM3, _XMM, _XMM | _MEM },
1099 { "vpunpckhwd", _ZMM, _ZMM, _MEM },
1100
1101 { "vpunpckhdq", _XMM3, _XMM, M_1to4 | _MEM },
1102 { "vpunpckhdq", _ZMM, _ZMM, M_1to16 | _MEM },
1103
1104 { "vpunpckhqdq", _XMM3, _XMM, M_1to2 | _MEM },
1105 { "vpunpckhqdq", _ZMM, _ZMM, M_1to8 | _MEM },
1106
1107 { "vpunpcklbw", _XMM3, _XMM, _XMM | _MEM },
1108 { "vpunpcklbw", _ZMM, _ZMM, _MEM },
1109
1110 { "vpunpcklwd", _XMM3, _XMM, _XMM | _MEM },
1111 { "vpunpcklwd", _ZMM, _ZMM, _MEM },
1112
1113 { "vpunpckldq", _XMM3, _XMM, M_1to4 | _MEM },
1114 { "vpunpckldq", _ZMM, _ZMM, M_1to16 | _MEM },
1115
1116 { "vpunpcklqdq", _XMM3, _XMM, M_1to2 | _MEM },
1117 { "vpunpcklqdq", _ZMM, _ZMM, M_1to8 | _MEM },
1118
1119 { "vextractf32x4", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 },
1120 { "vextractf64x2", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 },
1121 { "vextractf32x8", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 },
1122 { "vextractf64x4", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 },
1123
1124 { "vextracti32x4", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 },
1125 { "vextracti64x2", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 },
1126 { "vextracti32x8", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 },
1127 { "vextracti64x4", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 },
1128
1129 { "vextractps", REG32 | _MEM, _XMM3, IMM8 },
1130
1131 { "vpermb", XMM_KZ, _XMM, _XMM | _MEM },
1132 { "vpermb", ZMM_KZ, _ZMM, _ZMM | _MEM },
1133
1134 { "vpermw", XMM_KZ, _XMM, _XMM | _MEM },
1135 { "vpermw", ZMM_KZ, _ZMM, _ZMM | _MEM },
1136
1137 { "vpermd", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM },
1138 { "vpermd", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },
1139
1140 { "vpermilpd", XMM_KZ, _XMM, _XMM | M_1to2 | _MEM },
1141 { "vpermilpd", ZMM_KZ, _ZMM, M_1to8 | _MEM },
1142 { "vpermilpd", XMM_KZ, M_1to2 | _MEM, IMM8 },
1143 { "vpermilpd", ZMM_KZ, M_1to8 | _MEM, IMM8 },
1144
1145 { "vpermilps", XMM_KZ, _XMM, _XMM | _MEM | M_1to4 },
1146 { "vpermilps", ZMM_KZ, _ZMM, _MEM | M_1to16 },
1147 { "vpermilps", XMM_KZ, _MEM | M_1to4 | _MEM, IMM8 },
1148 { "vpermilps", ZMM_KZ, _MEM | M_1to16 | _MEM, IMM8 },
1149
1150 { "vpermpd", YMM_KZ, _YMM | M_1to4 | _MEM, IMM8 },
1151 { "vpermpd", ZMM_KZ, _ZMM | M_1to8 | _MEM, IMM8 },
1152 { "vpermpd", YMM_KZ, _YMM, M_1to4 | _MEM },
1153 { "vpermpd", ZMM_KZ, _ZMM, M_1to8 | _MEM },
1154
1155 { "vpermps", YMM_KZ, _YMM, M_1to8 | _MEM },
1156 { "vpermps", ZMM_KZ, _ZMM, M_1to16 | _MEM },
1157
1158 { "vpermq", YMM_KZ, _YMM | M_1to4 | _MEM, IMM8 },
1159 { "vpermq", ZMM_KZ, _ZMM | M_1to8 | _MEM, IMM8 },
1160 { "vpermq", YMM_KZ, _YMM, M_1to4 | _MEM },
1161 { "vpermq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
1162 };
1163 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1164 const Tbl& p = tbl[i];
1165 put(p.name, p.x1, p.x2, p.xm);
1166 }
1167#endif
1168 }
1170 {
1171 const struct Tbl {
1172 const char *name;
1173 uint64_t x1;
1174 uint64_t x2;
1175 uint64_t xm;
1176 } tbl[] = {
1177#ifdef XBYAK64
1178 { "vinsertps", _XMM, _XMM, _XMM3 | _MEM },
1179
1180 { "vshufpd", XMM_KZ, _XMM, M_1to2 | _MEM },
1181 { "vshufpd", ZMM_KZ, _ZMM, M_1to8 | _MEM },
1182
1183 { "vshufps", XMM_KZ, _XMM, M_1to4 | _MEM },
1184 { "vshufps", ZMM_KZ, _ZMM, M_1to16 | _MEM },
1185
1186 { "vinsertf32x4", _YMM | YMM_KZ, _YMM, _XMM | _MEM },
1187 { "vinsertf32x4", _ZMM | ZMM_KZ, _ZMM, _XMM | _MEM },
1188
1189 { "vinsertf64x2", _YMM | YMM_KZ, _YMM, _XMM | _MEM },
1190 { "vinsertf64x2", _ZMM | ZMM_KZ, _ZMM, _XMM | _MEM },
1191
1192 { "vinsertf32x8", _ZMM | ZMM_KZ, _ZMM, _YMM | _MEM },
1193 { "vinsertf64x4", _ZMM | ZMM_KZ, _ZMM, _YMM | _MEM },
1194
1195 { "vinserti32x4", _YMM | YMM_KZ, _YMM, _XMM | _MEM },
1196 { "vinserti32x4", _ZMM | ZMM_KZ, _ZMM, _XMM | _MEM },
1197
1198 { "vinserti64x2", _YMM | YMM_KZ, _YMM, _XMM | _MEM },
1199 { "vinserti64x2", _ZMM | ZMM_KZ, _ZMM, _XMM | _MEM },
1200
1201 { "vinserti32x8", _ZMM | ZMM_KZ, _ZMM, _YMM | _MEM },
1202 { "vinserti64x4", _ZMM | ZMM_KZ, _ZMM, _YMM | _MEM },
1203#endif
1204 { "vpalignr", ZMM_KZ, _ZMM, _ZMM | _MEM },
1205 };
1206 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1207 const Tbl& p = tbl[i];
1208 put(p.name, p.x1, p.x2, p.xm, IMM8);
1209 }
1210#ifdef XBYAK64
1211 put("vpextrb", _REG64 | _MEM, _XMM3, IMM8);
1212 put("vpextrw", _REG64 | _MEM, _XMM3, IMM8);
1213 put("vpextrd", _REG32 | _MEM, _XMM3, IMM8);
1214 put("vpextrq", _REG64 | _MEM, _XMM3, IMM8);
1215 put("vpinsrb", _XMM3, _XMM3, _REG32 | _MEM, IMM8);
1216 put("vpinsrw", _XMM3, _XMM3, _REG32 | _MEM, IMM8);
1217 put("vpinsrd", _XMM3, _XMM3, _REG32 | _MEM, IMM8);
1218 put("vpinsrq", _XMM3, _XMM3, _REG64 | _MEM, IMM8);
1219#endif
1220 }
1222 {
1223 const struct Tbl {
1224 const char *name;
1225 bool supportYMM;
1226 } tbl[] = {
1227 { "vfmadd", true },
1228 { "vfmadd", false },
1229 { "vfmaddsub", true },
1230 { "vfmsubadd", true },
1231 { "vfmsub", true },
1232 { "vfmsub", false },
1233 { "vfnmadd", true },
1234 { "vfnmadd", false },
1235 { "vfnmsub", true },
1236 { "vfnmsub", false },
1237 };
1238 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1239 const Tbl& p = tbl[i];
1240 const struct Ord {
1241 const char *name;
1242 } ord[] = {
1243 { "132" },
1244 { "213" },
1245 { "231" },
1246 };
1247 for (size_t j = 0; j < NUM_OF_ARRAY(ord); j++) {
1248 const char sufTbl[][2][8] = {
1249 { "pd", "ps" },
1250 { "sd", "ss" },
1251 };
1252 for (size_t k = 0; k < 2; k++) {
1253 const std::string suf = sufTbl[p.supportYMM ? 0 : 1][k];
1254 uint64_t mem = 0;
1255 if (suf == "pd") {
1256 mem = M_1to2;
1257 } else if (suf == "ps") {
1258 mem = M_1to4;
1259 } else {
1260 mem = XMM_ER;
1261 }
1262 std::string name = std::string(p.name) + ord[j].name + suf;
1263 const char *q = name.c_str();
1264 put(q, XMM_KZ, _XMM, mem | _MEM);
1265 if (!p.supportYMM) continue;
1266 if (suf == "pd") {
1267 mem = M_1to8;
1268 } else if (suf == "ps") {
1269 mem = M_1to16;
1270 } else {
1271 mem = XMM_ER;
1272 }
1273 put(q, _ZMM, _ZMM, mem | _MEM);
1274 }
1275 }
1276 }
1277 }
1279 {
1280 const struct Tbl {
1281 const char *name;
1282 bool all_xmm; // 2nd param
1283 } tbl[] = {
1284 { "vpmovsxbw", false },
1285 { "vpmovsxbd", true },
1286 { "vpmovsxbq", true },
1287 { "vpmovsxwd", false },
1288 { "vpmovsxwq", true },
1289 { "vpmovsxdq", false },
1290
1291 { "vpmovzxbw", false },
1292 { "vpmovzxbd", true },
1293 { "vpmovzxbq", true },
1294 { "vpmovzxwd", false },
1295 { "vpmovzxwq", true },
1296 { "vpmovzxdq", false },
1297 };
1298 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1299 const Tbl& p = tbl[i];
1300 const char *name = p.name;
1301 put(name, XMM_KZ | YMM, _XMM | _MEM);
1302 if (p.all_xmm) {
1303 put(name, ZMM, _XMM | _MEM);
1304 } else {
1305 put(name, ZMM, YMM | _MEM);
1306 }
1307 }
1308 }
1310 {
1311#ifdef XBYAK64
1312 const struct Tbl {
1313 std::string name;
1314 bool only_pd_ps;
1315 } tbl[] = {
1316 { "vadd", false },
1317 { "vsub", false },
1318 { "vmul", false },
1319 { "vdiv", false },
1320 { "vmax", false },
1321 { "vmin", false },
1322 { "vand", true },
1323 { "vandn", true },
1324 { "vor", true },
1325 { "vxor", true },
1326 };
1327 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1328 const struct Suf {
1329 const char *suf;
1330 bool supportYMM;
1331 } sufTbl[] = {
1332 { "pd", true },
1333 { "ps", true },
1334 { "sd", false },
1335 { "ss", false },
1336 };
1337 for (size_t j = 0; j < NUM_OF_ARRAY(sufTbl); j++) {
1338 if (tbl[i].only_pd_ps && j == 2) break;
1339 std::string suf = sufTbl[j].suf;
1340 std::string name = tbl[i].name + suf;
1341 const char *p = name.c_str();
1342 uint64_t mem = 0;
1343 if (suf == "pd") {
1344 mem = M_1to2;
1345 } else if (suf == "ps") {
1346 mem = M_1to4;
1347 }
1348 put(p, _XMM3 | XMM_KZ, _XMM, mem | _MEM);
1349 if (!sufTbl[j].supportYMM) continue;
1350 mem = 0;
1351 if (suf == "pd") {
1352 mem = M_1to8;
1353 } else if (suf == "ps") {
1354 mem = M_1to16;
1355 }
1356 put(p, _ZMM, _ZMM, mem | _MEM);
1357 }
1358 }
1359#endif
1360 }
1361 void putAVX1()
1362 {
1363 const struct Tbl {
1364 const char *name;
1365 bool only_pd_ps;
1366 } tbl[] = {
1367 { "add", false },
1368 { "sub", false },
1369 { "mul", false },
1370 { "div", false },
1371 { "max", false },
1372 { "min", false },
1373 { "and", true },
1374 { "andn", true },
1375 { "or", true },
1376 { "xor", true },
1377 };
1378 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1379 const struct Suf {
1380 const char *suf;
1381 bool supportYMM;
1382 } suf[] = {
1383 { "pd", true },
1384 { "ps", true },
1385 { "sd", false },
1386 { "ss", false },
1387 };
1388 for (size_t j = 0; j < NUM_OF_ARRAY(suf); j++) {
1389 if (tbl[i].only_pd_ps && j == 2) break;
1390 std::string name = std::string("v") + tbl[i].name + suf[j].suf;
1391 const char *p = name.c_str();
1392 put(p, XMM, XMM | MEM);
1393 put(p, XMM, XMM, XMM | MEM);
1394 if (!suf[j].supportYMM) continue;
1395 put(p, YMM, YMM | MEM);
1396 put(p, YMM, YMM, YMM | MEM);
1397 put(p, ZMM, ZMM, ZMM | MEM);
1398 }
1399 }
1400 }
1402 {
1403#ifdef XBYAK64
1404 put("vcvtdq2pd", XMM_KZ, _XMM | _MEM | M_1to2);
1405 put("vcvtdq2pd", YMM_KZ, _XMM | _MEM | M_1to4);
1406 put("vcvtdq2pd", ZMM_KZ, _YMM | _MEM | M_1to8);
1407
1408 put("vcvtdq2ps", XMM_KZ, _XMM | _MEM | M_1to4);
1409 put("vcvtdq2ps", YMM_KZ, _YMM | _MEM | M_1to8);
1410 put("vcvtdq2ps", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_ER);
1411
1412 put("vcvtpd2dq", XMM_KZ, _XMM | M_xword | M_1to2);
1413 put("vcvtpd2dq", XMM_KZ, _YMM | M_yword | MY_1to4);
1414 put("vcvtpd2dq", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER);
1415
1416 put("vcvtpd2ps", XMM_KZ, _XMM | M_xword | M_1to2);
1417 put("vcvtpd2ps", XMM_KZ, _YMM | M_yword | MY_1to4);
1418 put("vcvtpd2ps", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER);
1419
1420 put("vcvtpd2qq", XMM_KZ, _XMM | _MEM | M_1to2);
1421 put("vcvtpd2qq", YMM_KZ, _YMM | _MEM | M_1to4);
1422 put("vcvtpd2qq", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_ER);
1423
1424 put("vcvtpd2udq", XMM_KZ, _XMM | M_xword | M_1to2);
1425 put("vcvtpd2udq", XMM_KZ, _YMM | M_yword | MY_1to4);
1426 put("vcvtpd2udq", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER);
1427
1428 put("vcvtpd2uqq", XMM_KZ, _XMM | _MEM | M_1to2);
1429 put("vcvtpd2uqq", YMM_KZ, _YMM | _MEM | M_1to4);
1430 put("vcvtpd2uqq", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_ER);
1431
1432 put("vcvtph2ps", XMM_KZ, _XMM | _MEM);
1433 put("vcvtph2ps", YMM_KZ, _XMM | _MEM);
1434 put("vcvtph2ps", ZMM_KZ, _YMM | _MEM | YMM_SAE);
1435
1436 put("vcvtps2ph", XMM_KZ | _MEM, _XMM, IMM8);
1437 put("vcvtps2ph", XMM_KZ | _MEM, _YMM, IMM8);
1438 put("vcvtps2ph", YMM_KZ | _MEM, _ZMM, IMM8);
1439 put("vcvtps2ph", YMM_KZ, ZMM_SAE, IMM8);
1440
1441 put("vcvtps2dq", XMM_KZ, _XMM | _MEM | M_1to4);
1442 put("vcvtps2dq", YMM_KZ, _YMM | _MEM | M_1to8);
1443 put("vcvtps2dq", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_ER);
1444
1445 put("vcvtps2udq", XMM_KZ, _XMM | M_1to4);
1446 put("vcvtps2udq", YMM_KZ, _YMM | M_1to8);
1447 put("vcvtps2udq", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_ER);
1448
1449 put("vcvtps2qq", XMM_KZ, _XMM | _MEM | M_1to2);
1450 put("vcvtps2qq", YMM_KZ, _XMM | _MEM | M_1to4);
1451 put("vcvtps2qq", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_ER);
1452
1453 put("vcvtps2uqq", XMM_KZ, _XMM | _MEM | M_1to2);
1454 put("vcvtps2uqq", YMM_KZ, _XMM | _MEM | M_1to4);
1455 put("vcvtps2uqq", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_ER);
1456
1457 put("vcvtps2pd", XMM_KZ, _XMM | _MEM | M_1to2);
1458 put("vcvtps2pd", YMM_KZ, _XMM | _MEM | M_1to4);
1459 put("vcvtps2pd", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_SAE);
1460
1461 put("vcvtqq2pd", XMM_KZ, _XMM | _MEM | M_1to2);
1462 put("vcvtqq2pd", YMM_KZ, _YMM | _MEM | M_1to4);
1463 put("vcvtqq2pd", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_ER);
1464
1465 put("vcvtqq2ps", XMM_KZ, _XMM | M_xword | M_1to2);
1466 put("vcvtqq2ps", XMM_KZ, _YMM | M_yword | MY_1to4);
1467 put("vcvtqq2ps", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER);
1468
1469 put("vcvtsd2si", REG32 | REG64, _XMM3 | _MEM | XMM_ER);
1470
1471 put("vcvtsd2usi", REG32 | REG64, _XMM3 | _MEM | XMM_ER);
1472
1473 put("vcvtsd2ss", XMM_KZ, _XMM3, _XMM3 | _MEM | XMM_ER);
1474
1475 put("vcvtsi2sd", _XMM3, _XMM3, REG32 | REG64 | MEM32 | MEM64);
1476 put("vcvtsi2sd", XMM, XMM_ER, REG64);
1477
1478 put("vcvtsi2ss", _XMM3, _XMM3, REG32 | REG64 | MEM32 | MEM64);
1479 put("vcvtsi2ss", XMM, XMM_ER, REG32 | REG64);
1480
1481 put("vcvtss2sd", XMM_KZ, _XMM3, _XMM3 | _MEM | XMM_SAE);
1482
1483 put("vcvtss2si", REG32 | REG64, _XMM3 | _MEM | XMM_ER);
1484
1485 put("vcvtss2usi", REG32 | REG64, _XMM3 | _MEM | XMM_ER);
1486
1487 put("vcvtpd2dq", XMM_KZ, _XMM | M_xword | M_1to2);
1488 put("vcvtpd2dq", XMM_KZ, _YMM | M_yword | MY_1to4);
1489 put("vcvtpd2dq", YMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_ER);
1490
1491 put("vcvttpd2qq", XMM_KZ, _XMM | _MEM | M_1to2);
1492 put("vcvttpd2qq", YMM_KZ, _YMM | _MEM | M_1to4);
1493 put("vcvttpd2qq", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE);
1494
1495 put("vcvttpd2udq", XMM_KZ, _XMM | M_xword | M_1to2);
1496 put("vcvttpd2udq", XMM_KZ, _YMM | M_yword | MY_1to4);
1497 put("vcvttpd2udq", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_SAE);
1498
1499 put("vcvttpd2uqq", XMM_KZ, _XMM | _MEM | M_1to2);
1500 put("vcvttpd2uqq", YMM_KZ, _YMM | _MEM | M_1to4);
1501 put("vcvttpd2uqq", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE);
1502
1503 put("vcvttps2dq", XMM_KZ, _XMM | _MEM | M_1to4);
1504 put("vcvttps2dq", YMM_KZ, _YMM | _MEM | M_1to8);
1505 put("vcvttps2dq", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_SAE);
1506
1507 put("vcvttps2udq", XMM_KZ, _XMM | M_1to4);
1508 put("vcvttps2udq", YMM_KZ, _YMM | M_1to8);
1509 put("vcvttps2udq", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_SAE);
1510
1511 put("vcvttps2qq", XMM_KZ, _XMM | _MEM | M_1to2);
1512 put("vcvttps2qq", YMM_KZ, _XMM | _MEM | M_1to4);
1513 put("vcvttps2qq", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_SAE);
1514
1515 put("vcvttps2uqq", XMM_KZ, _XMM | _MEM | M_1to2);
1516 put("vcvttps2uqq", YMM_KZ, _XMM | _MEM | M_1to4);
1517 put("vcvttps2uqq", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_SAE);
1518
1519 put("vcvttsd2si", REG32 | REG64, _XMM3 | _MEM | XMM_SAE);
1520
1521 put("vcvttsd2usi", REG32 | REG64, _XMM3 | _MEM | XMM_SAE);
1522
1523 put("vcvttss2si", REG32 | REG64, _XMM3 | _MEM | XMM_SAE);
1524
1525 put("vcvttss2usi", REG32 | REG64, _XMM3 | _MEM | XMM_SAE);
1526
1527 put("vcvtudq2pd", XMM_KZ, _XMM | _MEM | M_1to2);
1528 put("vcvtudq2pd", YMM_KZ, _XMM | _MEM | M_1to4);
1529 put("vcvtudq2pd", ZMM_KZ, _YMM | _MEM | M_1to8);
1530
1531 put("vcvtudq2ps", XMM_KZ, _XMM | _MEM | M_1to4);
1532 put("vcvtudq2ps", YMM_KZ, _YMM | _MEM | M_1to8);
1533 put("vcvtudq2ps", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_ER);
1534
1535 put("vcvtuqq2pd", XMM_KZ, _XMM | _MEM | M_1to2);
1536 put("vcvtuqq2pd", YMM_KZ, _YMM | _MEM | M_1to4);
1537 put("vcvtuqq2pd", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_ER);
1538
1539 put("vcvtuqq2ps", XMM_KZ, _XMM | M_xword | M_1to2);
1540 put("vcvtuqq2ps", XMM_KZ, _YMM | M_yword | MY_1to4);
1541 put("vcvtuqq2ps", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER);
1542
1543 put("vcvtusi2sd", _XMM3, _XMM3, REG32 | REG64 | MEM32 | MEM64);
1544 put("vcvtusi2sd", XMM, XMM_ER, REG64);
1545
1546 put("vcvtusi2ss", _XMM3, _XMM3, REG32 | REG64 | MEM32 | MEM64);
1547 put("vcvtusi2ss", XMM, XMM_ER, REG32 | REG64);
1548#endif
1549 }
1550 enum {
1553 xx_xy_yz
1556 {
1557#ifdef XBYAK64
1558 const struct Tbl {
1559 const char *name;
1560 int mode;
1561 } tbl[] = {
1562 { "vpgatherdd", xx_yy_zz },
1563 { "vpgatherdq", xx_yx_zy },
1564 { "vpgatherqd", xx_xy_yz },
1565 { "vpgatherqq", xx_yy_zz },
1566 { "vgatherdps", xx_yy_zz },
1567 { "vgatherdpd", xx_yx_zy },
1568 { "vgatherqps", xx_xy_yz },
1569 { "vgatherqpd", xx_yy_zz },
1570 };
1571 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1572 const Tbl& p = tbl[i];
1573 switch (p.mode) {
1574 case xx_yy_zz:
1575 put(p.name, XMM_K, VM32X);
1576 put(p.name, YMM_K, VM32Y);
1577 put(p.name, ZMM_K, VM32Z);
1578 break;
1579 case xx_yx_zy:
1580 put(p.name, XMM_K, VM32X);
1581 put(p.name, YMM_K, VM32X);
1582 put(p.name, ZMM_K, VM32Y);
1583 break;
1584 case xx_xy_yz:
1585 put(p.name, XMM_K, VM32X);
1586 put(p.name, XMM_K, VM32Y);
1587 put(p.name, YMM_K, VM32Z);
1588 break;
1589 }
1590 }
1591#endif
1592 }
1594 {
1595#ifdef XBYAK64
1596 const struct Tbl {
1597 const char *name;
1598 int mode;
1599 } tbl[] = {
1600 { "vpscatterdd", xx_yy_zz },
1601 { "vpscatterdq", xx_xy_yz },
1602 { "vpscatterqd", xx_yx_zy },
1603 { "vpscatterqq", xx_yy_zz },
1604
1605 { "vscatterdps", xx_yy_zz },
1606 { "vscatterdpd", xx_xy_yz },
1607 { "vscatterqps", xx_yx_zy },
1608 { "vscatterqpd", xx_yy_zz },
1609 };
1610 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1611 const Tbl& p = tbl[i];
1612 switch (p.mode) {
1613 case xx_yy_zz:
1614 put(p.name, VM32X_K, _XMM);
1615 put(p.name, VM32Y_K, _YMM);
1616 put(p.name, VM32Z_K, _ZMM);
1617 break;
1618 case xx_yx_zy:
1619 put(p.name, VM32X_K, _XMM);
1620 put(p.name, VM32Y_K, _XMM);
1621 put(p.name, VM32Z_K, _YMM);
1622 break;
1623 case xx_xy_yz:
1624 put(p.name, VM32X_K, _XMM);
1625 put(p.name, VM32X_K, _YMM);
1626 put(p.name, VM32Y_K, _ZMM);
1627 break;
1628 }
1629 }
1630#endif
1631 }
1633 {
1634 put("vblendmpd", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
1635 put("vblendmpd", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
1636 put("vblendmpd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8);
1637
1638 put("vblendmps", XMM_KZ, _XMM, _XMM | _MEM | M_1to4);
1639 put("vblendmps", YMM_KZ, _YMM, _YMM | _MEM | M_1to8);
1640 put("vblendmps", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16);
1641
1642 put("vpblendmb", XMM_KZ, _XMM, _XMM | _MEM);
1643 put("vpblendmb", YMM_KZ, _YMM, _YMM | _MEM);
1644 put("vpblendmb", ZMM_KZ, _ZMM, _ZMM | _MEM);
1645
1646 put("vpblendmb", XMM_KZ, _XMM, _XMM | _MEM);
1647 put("vpblendmb", YMM_KZ, _YMM, _YMM | _MEM);
1648 put("vpblendmb", ZMM_KZ, _ZMM, _ZMM | _MEM);
1649
1650 put("vpblendmd", XMM_KZ, _XMM, _XMM | _MEM | M_1to4);
1651 put("vpblendmd", YMM_KZ, _YMM, _YMM | _MEM | M_1to8);
1652 put("vpblendmd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16);
1653
1654 put("vpblendmq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
1655 put("vpblendmq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
1656 put("vpblendmq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8);
1657 }
1659 {
1660 const uint64_t b0Tbl[] = { 0, 0, 0 };
1661 const uint64_t b4Tbl[] = { M_1to4, M_1to8, M_1to16 };
1662 const uint64_t b2Tbl[] = { M_1to2, M_1to4, M_1to8 };
1663 const struct Tbl {
1664 const char *name;
1665 uint64_t b;
1666 } tbl[] = {
1667 { "vpcmpb", 0 },
1668 { "vpcmpub", 0 },
1669 { "vpcmpw", 0 },
1670 { "vpcmpuw", 0 },
1671 { "vpcmpd", M_1to4 },
1672 { "vpcmpud", M_1to4 },
1673 { "vpcmpq", M_1to2 },
1674 { "vpcmpuq", M_1to2 },
1675 };
1676 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1677 const Tbl& p = tbl[i];
1678 const uint64_t *bTbl = p.b == 0 ? b0Tbl : p.b == M_1to4 ? b4Tbl : b2Tbl;
1679 put(p.name, K_K, _XMM, _XMM | _MEM | bTbl[0], IMM8);
1680 put(p.name, K_K, _YMM, _YMM | _MEM | bTbl[1], IMM8);
1681 put(p.name, K_K, _ZMM, _ZMM | _MEM | bTbl[2], IMM8);
1682 }
1683 }
1685 {
1686 const uint64_t b0Tbl[] = { 0, 0, 0 };
1687 const uint64_t b4Tbl[] = { M_1to4, M_1to8, M_1to16 };
1688 const uint64_t b2Tbl[] = { M_1to2, M_1to4, M_1to8 };
1689 const struct Tbl {
1690 const char *name;
1691 uint64_t b;
1692 } tbl[] = {
1693 { "vptestmb", 0 },
1694 { "vptestmw", 0 },
1695 { "vptestmd", M_1to4 },
1696 { "vptestmq", M_1to2 },
1697
1698 { "vptestnmb", 0 },
1699 { "vptestnmw", 0 },
1700 { "vptestnmd", M_1to4 },
1701 { "vptestnmq", M_1to2 },
1702 };
1703 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1704 const Tbl& p = tbl[i];
1705 const uint64_t *bTbl = p.b == 0 ? b0Tbl : p.b == M_1to4 ? b4Tbl : b2Tbl;
1706 put(p.name, K_K, _XMM, _XMM | _MEM | bTbl[0]);
1707 put(p.name, K_K, _YMM, _YMM | _MEM | bTbl[1]);
1708 put(p.name, K_K, _ZMM, _ZMM | _MEM | bTbl[2]);
1709 }
1710 }
1712 {
1713 {
1714 const char *tbl[] = {
1715 "vcompresspd",
1716 "vcompressps",
1717 "vpcompressd",
1718 "vpcompressq",
1719 };
1720 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1721 const char *name = tbl[i];
1722 put(name, XMM_KZ | _MEM, _XMM);
1723 put(name, YMM_KZ | _MEM, _YMM);
1724 put(name, ZMM_KZ | _MEM, _ZMM);
1725 }
1726 }
1727 {
1728 const char *tbl[] = {
1729 "vexpandpd",
1730 "vexpandps",
1731 "vpexpandd",
1732 "vpexpandq",
1733 };
1734 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1735 const char *name = tbl[i];
1736 put(name, XMM_KZ, _XMM | _MEM);
1737 put(name, YMM_KZ, _YMM | _MEM);
1738 put(name, ZMM_KZ, _ZMM | _MEM);
1739 }
1740 }
1741 }
1742 void putPerm()
1743 {
1744 const uint64_t b0Tbl[] = { 0, 0, 0 };
1745 const uint64_t b4Tbl[] = { M_1to4, M_1to8, M_1to16 };
1746 const uint64_t b2Tbl[] = { M_1to2, M_1to4, M_1to8 };
1747 const struct Tbl {
1748 const char *name;
1749 uint64_t b;
1750 } tbl[] = {
1751 { "vpermt2b", 0 },
1752 { "vpermt2w", 0 },
1753 { "vpermt2d", M_1to4 },
1754 { "vpermt2q", M_1to2 },
1755 { "vpermt2ps", M_1to4 },
1756 { "vpermt2pd", M_1to2 },
1757
1758 { "vpermi2b", 0 },
1759 { "vpermi2w", 0 },
1760 { "vpermi2d", M_1to4 },
1761 { "vpermi2q", M_1to2 },
1762 { "vpermi2ps", M_1to4 },
1763 };
1764 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1765 const Tbl& p = tbl[i];
1766 const uint64_t *bTbl = p.b == 0 ? b0Tbl : p.b == M_1to4 ? b4Tbl : b2Tbl;
1767 put(p.name, XMM_KZ, _XMM, _XMM | _MEM | bTbl[0]);
1768 put(p.name, YMM_KZ, _YMM, _YMM | _MEM | bTbl[1]);
1769 put(p.name, ZMM_KZ, _ZMM, _ZMM | _MEM | bTbl[2]);
1770 }
1771 }
1773 {
1774 put("vshuff32x4", YMM_KZ, _YMM, _YMM | _MEM | M_1to8, IMM8);
1775 put("vshuff32x4", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16, IMM8);
1776
1777 put("vshuff64x2", YMM_KZ, _YMM, _YMM | _MEM | M_1to4, IMM8);
1778 put("vshuff64x2", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8, IMM8);
1779
1780 put("vshufi32x4", YMM_KZ, _YMM, _YMM | _MEM | M_1to8, IMM8);
1781 put("vshufi32x4", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16, IMM8);
1782
1783 put("vshufi64x2", YMM_KZ, _YMM, _YMM | _MEM | M_1to4, IMM8);
1784 put("vshufi64x2", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8, IMM8);
1785 }
1786 void putMov()
1787 {
1788 put("vpmovm2b", _XMM | _YMM | _ZMM, K);
1789 put("vpmovm2w", _XMM | _YMM | _ZMM, K);
1790 put("vpmovm2d", _XMM | _YMM | _ZMM, K);
1791 put("vpmovm2q", _XMM | _YMM | _ZMM, K);
1792
1793 put("vpmovb2m", K, _XMM | _YMM | _ZMM);
1794 put("vpmovw2m", K, _XMM | _YMM | _ZMM);
1795 put("vpmovd2m", K, _XMM | _YMM | _ZMM);
1796 put("vpmovq2m", K, _XMM | _YMM | _ZMM);
1797
1798 put("vpmovqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
1799 put("vpmovsqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
1800 put("vpmovusqb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
1801
1802 put("vpmovqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
1803 put("vpmovsqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
1804 put("vpmovusqw", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
1805
1806 put("vpmovqd", XMM_KZ | _MEM, _XMM | _YMM);
1807 put("vpmovqd", YMM_KZ | _MEM, _ZMM);
1808
1809 put("vpmovsqd", XMM_KZ | _MEM, _XMM | _YMM);
1810 put("vpmovsqd", YMM_KZ | _MEM, _ZMM);
1811
1812 put("vpmovusqd", XMM_KZ | _MEM, _XMM | _YMM);
1813 put("vpmovusqd", YMM_KZ | _MEM, _ZMM);
1814
1815 put("vpmovdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
1816 put("vpmovsdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
1817 put("vpmovusdb", XMM_KZ | _MEM, _XMM | _YMM | _ZMM);
1818
1819 put("vpmovdw", XMM_KZ | _MEM, _XMM | _YMM);
1820 put("vpmovdw", YMM_KZ | _MEM, _ZMM);
1821
1822 put("vpmovsdw", XMM_KZ | _MEM, _XMM | _YMM);
1823 put("vpmovsdw", YMM_KZ | _MEM, _ZMM);
1824
1825 put("vpmovusdw", XMM_KZ | _MEM, _XMM | _YMM);
1826 put("vpmovusdw", YMM_KZ | _MEM, _ZMM);
1827
1828 put("vpmovwb", XMM_KZ | _MEM, _XMM | _YMM);
1829 put("vpmovwb", YMM_KZ | _MEM, _ZMM);
1830
1831 put("vpmovswb", XMM_KZ | _MEM, _XMM | _YMM);
1832 put("vpmovswb", YMM_KZ | _MEM, _ZMM);
1833
1834 put("vpmovuswb", XMM_KZ | _MEM, _XMM | _YMM);
1835 put("vpmovuswb", YMM_KZ | _MEM, _ZMM);
1836 }
1837 void putRot()
1838 {
1839 put("vprolvd", XMM_KZ, _XMM, _XMM | _MEM | M_1to4);
1840 put("vprolvd", YMM_KZ, _YMM, _YMM | _MEM | M_1to8);
1841 put("vprolvd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16);
1842
1843 put("vprolvq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
1844 put("vprolvq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
1845 put("vprolvq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8);
1846
1847 put("vprorvd", XMM_KZ, _XMM, _XMM | _MEM | M_1to4);
1848 put("vprorvd", YMM_KZ, _YMM, _YMM | _MEM | M_1to8);
1849 put("vprorvd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16);
1850
1851 put("vprorvq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
1852 put("vprorvq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
1853 put("vprorvq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8);
1854
1855 put("vprold", XMM_KZ, _XMM | _MEM | M_1to4, IMM8);
1856 put("vprold", YMM_KZ, _YMM | _MEM | M_1to8, IMM8);
1857 put("vprold", ZMM_KZ, _ZMM | _MEM | M_1to16, IMM8);
1858
1859 put("vprolq", XMM_KZ, _XMM | _MEM | M_1to2, IMM8);
1860 put("vprolq", YMM_KZ, _YMM | _MEM | M_1to4, IMM8);
1861 put("vprolq", ZMM_KZ, _ZMM | _MEM | M_1to8, IMM8);
1862
1863 put("vprord", XMM_KZ, _XMM | _MEM | M_1to4, IMM8);
1864 put("vprord", YMM_KZ, _YMM | _MEM | M_1to8, IMM8);
1865 put("vprord", ZMM_KZ, _ZMM | _MEM | M_1to16, IMM8);
1866
1867 put("vprorq", XMM_KZ, _XMM | _MEM | M_1to2, IMM8);
1868 put("vprorq", YMM_KZ, _YMM | _MEM | M_1to4, IMM8);
1869 put("vprorq", ZMM_KZ, _ZMM | _MEM | M_1to8, IMM8);
1870 }
1872 {
 // Emits a long, order-sensitive list of miscellaneous patterns. Everything
 // in this body is compiled only when XBYAK64 is defined. Packed groups list
 // the xmm, ymm and zmm forms; the M_1toN flag is the broadcast width that
 // goes with each form, and *_SAE / *_ER flags are added where the call lists them.
1873#ifdef XBYAK64
 // vpternlog{d,q}: three vector operands plus imm8
1874 put("vpternlogd", XMM_KZ, _XMM, _XMM | _MEM | M_1to4, IMM8);
1875 put("vpternlogd", YMM_KZ, _YMM, _YMM | _MEM | M_1to8, IMM8);
1876 put("vpternlogd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16, IMM8);
1877
1878 put("vpternlogq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2, IMM8);
1879 put("vpternlogq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4, IMM8);
1880 put("vpternlogq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8, IMM8);
1881
 // vgetexp*: note these packed forms use MEM (not _MEM) for the memory operand
1882 put("vgetexppd", XMM_KZ, _XMM | MEM | M_1to2);
1883 put("vgetexppd", YMM_KZ, _YMM | MEM | M_1to4);
1884 put("vgetexppd", ZMM_KZ, _ZMM | MEM | M_1to8 | ZMM_SAE);
1885
1886 put("vgetexpps", XMM_KZ, _XMM | MEM | M_1to4);
1887 put("vgetexpps", YMM_KZ, _YMM | MEM | M_1to8);
1888 put("vgetexpps", ZMM_KZ, _ZMM | MEM | M_1to16 | ZMM_SAE);
1889
1890 put("vgetexpsd", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE);
1891 put("vgetexpss", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE);
1892
 // vgetmant*: source plus imm8
1893 put("vgetmantpd", XMM_KZ, _XMM | _MEM | M_1to2, IMM8);
1894 put("vgetmantpd", YMM_KZ, _YMM | _MEM | M_1to4, IMM8);
1895 put("vgetmantpd", ZMM_KZ, _ZMM | _MEM | M_1to8, IMM8);
1896
1897 put("vgetmantps", XMM_KZ, _XMM | _MEM | M_1to4, IMM8);
1898 put("vgetmantps", YMM_KZ, _YMM | _MEM | M_1to8, IMM8);
1899 put("vgetmantps", ZMM_KZ, _ZMM | _MEM | M_1to16, IMM8);
1900
1901 put("vgetmantsd", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8);
1902 put("vgetmantss", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8);
1903
 // vfixupimm*: three operands plus imm8
1904 put("vfixupimmpd", XMM_KZ, _XMM, _XMM | _MEM | M_1to2, IMM8);
1905 put("vfixupimmpd", YMM_KZ, _YMM, _YMM | _MEM | M_1to4, IMM8);
1906 put("vfixupimmpd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8, IMM8);
1907
1908 put("vfixupimmps", XMM_KZ, _XMM, _XMM | _MEM | M_1to4, IMM8);
1909 put("vfixupimmps", YMM_KZ, _YMM, _YMM | _MEM | M_1to8, IMM8);
1910 put("vfixupimmps", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16, IMM8);
1911
1912 put("vfixupimmsd", XMM_KZ, _XMM, _XMM | _MEM, IMM8);
1913 put("vfixupimmss", XMM_KZ, _XMM, _XMM | _MEM, IMM8);
1914
 // vrcp14* / vrsqrt14*
1915 put("vrcp14pd", XMM_KZ, _XMM | _MEM | M_1to2);
1916 put("vrcp14pd", YMM_KZ, _YMM | _MEM | M_1to4);
1917 put("vrcp14pd", ZMM_KZ, _ZMM | _MEM | M_1to8);
1918
1919 put("vrcp14ps", XMM_KZ, _XMM | _MEM | M_1to4);
1920 put("vrcp14ps", YMM_KZ, _YMM | _MEM | M_1to8);
1921 put("vrcp14ps", ZMM_KZ, _ZMM | _MEM | M_1to16);
1922
1923 put("vrcp14sd", XMM_KZ, _XMM, _XMM | _MEM);
1924
1925 put("vrcp14ss", XMM_KZ, _XMM, _XMM | _MEM);
1926
1927 put("vrsqrt14pd", XMM_KZ, _XMM | _MEM | M_1to2);
1928 put("vrsqrt14pd", YMM_KZ, _YMM | _MEM | M_1to4);
1929 put("vrsqrt14pd", ZMM_KZ, _ZMM | _MEM | M_1to8);
1930
1931 put("vrsqrt14ps", XMM_KZ, _XMM | _MEM | M_1to4);
1932 put("vrsqrt14ps", YMM_KZ, _YMM | _MEM | M_1to8);
1933 put("vrsqrt14ps", ZMM_KZ, _ZMM | _MEM | M_1to16);
1934
1935 put("vrsqrt14sd", XMM_KZ, _XMM, _XMM | _MEM);
1936
1937 put("vrsqrt14ss", XMM_KZ, _XMM, _XMM | _MEM);
1938
 // vrndscale*: source plus imm8
1939 put("vrndscalepd", XMM_KZ, _XMM | _MEM | M_1to2, IMM8);
1940 put("vrndscalepd", YMM_KZ, _YMM | _MEM | M_1to4, IMM8);
1941 put("vrndscalepd", ZMM_KZ, _ZMM | _MEM | M_1to8, IMM8);
1942
1943 put("vrndscaleps", XMM_KZ, _XMM | _MEM | M_1to4, IMM8);
1944 put("vrndscaleps", YMM_KZ, _YMM | _MEM | M_1to8, IMM8);
1945 put("vrndscaleps", ZMM_KZ, _ZMM | _MEM | M_1to16, IMM8);
1946
1947 put("vrndscalesd", XMM_KZ, _XMM, _XMM | _MEM, IMM8);
1948
1949 put("vrndscaless", XMM_KZ, _XMM, _XMM | _MEM, IMM8);
1950
 // vscalef*: the zmm and scalar forms carry the *_ER flag
1951 put("vscalefpd", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
1952 put("vscalefpd", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
1953 put("vscalefpd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 | ZMM_ER);
1954
1955 put("vscalefps", XMM_KZ, _XMM, _XMM | _MEM | M_1to4);
1956 put("vscalefps", YMM_KZ, _YMM, _YMM | _MEM | M_1to8);
1957 put("vscalefps", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 | ZMM_ER);
1958
1959 put("vscalefsd", XMM_KZ, _XMM, _XMM | _MEM | XMM_ER);
1960 put("vscalefss", XMM_KZ, _XMM, _XMM | _MEM | XMM_ER);
1961
 // integer helpers: vdbpsadbw, vpmultishiftqb, vpconflict*, vplzcnt*
1962 put("vdbpsadbw", XMM_KZ, _XMM, _XMM | _MEM, IMM8);
1963 put("vdbpsadbw", YMM_KZ, _YMM, _YMM | _MEM, IMM8);
1964 put("vdbpsadbw", ZMM_KZ, _ZMM, _ZMM | _MEM, IMM8);
1965
1966 put("vpmultishiftqb", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
1967 put("vpmultishiftqb", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
1968 put("vpmultishiftqb", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8);
1969
1970 put("vpconflictd", XMM_KZ, _XMM | _MEM | M_1to4);
1971 put("vpconflictd", YMM_KZ, _YMM | _MEM | M_1to8);
1972 put("vpconflictd", ZMM_KZ, _ZMM | _MEM | M_1to16);
1973
1974 put("vpconflictq", XMM_KZ, _XMM | _MEM | M_1to2);
1975 put("vpconflictq", YMM_KZ, _YMM | _MEM | M_1to4);
1976 put("vpconflictq", ZMM_KZ, _ZMM | _MEM | M_1to8);
1977
1978 put("vplzcntd", XMM_KZ, _XMM | _MEM | M_1to4);
1979 put("vplzcntd", YMM_KZ, _YMM | _MEM | M_1to8);
1980 put("vplzcntd", ZMM_KZ, _ZMM | _MEM | M_1to16);
1981
1982 put("vplzcntq", XMM_KZ, _XMM | _MEM | M_1to2);
1983 put("vplzcntq", YMM_KZ, _YMM | _MEM | M_1to4);
1984 put("vplzcntq", ZMM_KZ, _ZMM | _MEM | M_1to8);
1985
 // mask register source (K) broadcast into a vector
1986 put("vpbroadcastmb2q", _XMM | _YMM | _ZMM, K);
1987 put("vpbroadcastmw2d", _XMM | _YMM | _ZMM, K);
1988
 // vexp2* / vrcp28* / vrsqrt28*: only zmm packed forms and xmm scalar forms are listed
1989 put("vexp2pd", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE);
1990 put("vexp2ps", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_SAE);
1991
1992 put("vrcp28pd", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE);
1993 put("vrcp28ps", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_SAE);
1994 put("vrcp28sd", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE);
1995 put("vrcp28ss", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE);
1996
1997 put("vrsqrt28pd", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE);
1998 put("vrsqrt28ps", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_SAE);
1999 put("vrsqrt28sd", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE);
2000 put("vrsqrt28ss", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE);
2001
 // gather/scatter prefetch: a single vector-memory operand; the *dpd forms use VM32Y_K
2002 put("vgatherpf0dps", VM32Z_K);
2003 put("vgatherpf0qps", VM32Z_K);
2004 put("vgatherpf0dpd", VM32Y_K);
2005 put("vgatherpf0qpd", VM32Z_K);
2006
2007 put("vgatherpf1dps", VM32Z_K);
2008 put("vgatherpf1qps", VM32Z_K);
2009 put("vgatherpf1dpd", VM32Y_K);
2010 put("vgatherpf1qpd", VM32Z_K);
2011
2012 put("vscatterpf0dps", VM32Z_K);
2013 put("vscatterpf0qps", VM32Z_K);
2014 put("vscatterpf0dpd", VM32Y_K);
2015 put("vscatterpf0qpd", VM32Z_K);
2016
2017 put("vscatterpf1dps", VM32Z_K);
2018 put("vscatterpf1qps", VM32Z_K);
2019 put("vscatterpf1dpd", VM32Y_K);
2020 put("vscatterpf1qpd", VM32Z_K);
2021
 // vrange* / vreduce*: imm8 last, *_SAE on the zmm and scalar forms
2022 put("vrangepd", XMM_KZ, _XMM, _XMM | _MEM | M_1to2, IMM8);
2023 put("vrangepd", YMM_KZ, _YMM, _YMM | _MEM | M_1to4, IMM8);
2024 put("vrangepd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 | ZMM_SAE, IMM8);
2025
2026 put("vrangeps", XMM_KZ, _XMM, _XMM | _MEM | M_1to4, IMM8);
2027 put("vrangeps", YMM_KZ, _YMM, _YMM | _MEM | M_1to8, IMM8);
2028 put("vrangeps", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 | ZMM_SAE, IMM8);
2029
2030 put("vrangesd", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8);
2031 put("vrangess", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8);
2032
2033 put("vreducepd", XMM_KZ, _XMM | _MEM | M_1to2, IMM8);
2034 put("vreducepd", YMM_KZ, _YMM | _MEM | M_1to4, IMM8);
2035 put("vreducepd", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE, IMM8);
2036
2037 put("vreduceps", XMM_KZ, _XMM | _MEM | M_1to4, IMM8);
2038 put("vreduceps", YMM_KZ, _YMM | _MEM | M_1to8, IMM8);
2039 put("vreduceps", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_SAE, IMM8);
2040
2041 put("vreducesd", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8);
2042 put("vreducess", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8);
2043
 // vpmadd52{l,h}uq
2044 put("vpmadd52luq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
2045 put("vpmadd52luq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
2046 put("vpmadd52luq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8);
2047
2048 put("vpmadd52huq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
2049 put("vpmadd52huq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
2050 put("vpmadd52huq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8);
2051#endif
2052 }
2053 void classSubMem(const char *nm, char x, bool broadcast, int size)
2054 {
2055 printf("%s ", nm);
2056 if (isXbyak_) {
2057 printf("(k5|k3, %cword%s [rax+64], 5);dump();\n", x, broadcast ? "_b" : "");
2058 } else {
2059 if (broadcast) {
2060 int d = x == 'x' ? 128 / size : x == 'y' ? 256 / size : 512 / size;
2061 printf("k5{k3}, [rax+64]{1to%d}, 5\n", d);
2062 } else {
2063 if (x == 'x') x = 'o'; // nasm
2064 printf("k5{k3}, %cword [rax+64], 5\n", x);
2065 }
2066 }
2067 }
2068 void putClassSub(const char *name, int size)
2069 {
2070 put(name, K_K, _XMM | _YMM | _ZMM, IMM8);
2071 for (int i = 0; i < 2; i++) {
2072 classSubMem(name, 'x', i == 0, size);
2073 classSubMem(name, 'y', i == 0, size);
2074 classSubMem(name, 'z', i == 0, size);
2075 }
2076 }
2078 {
2079#ifdef XBYAK64
2080 putClassSub("vfpclasspd", 64);
2081 putClassSub("vfpclassps", 32);
2082 put("vfpclasssd", K_K, _XMM | _MEM, IMM8);
2083 put("vfpclassss", K_K, _XMM | _MEM, IMM8);
2084#endif
2085 }
void putMin()
{
#ifdef XBYAK64
	// the only pattern emitted by this function
	const char *const mnemonic = "vextractf32x4";
	put(mnemonic, XMM_KZ, _YMM, IMM8);
#endif
}
2093 {
2094 {
2095 const int tbl[] = {
2096 -129, -128, -127, 0, 1, 64, 65, 127, 128
2097 };
2098 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
2099 char xs[128], ns[128];
2100 int v = tbl[i];
2101 CYBOZU_SNPRINTF(xs, sizeof(xs), "xmm0, ptr[eax%+d]", v);
2102 CYBOZU_SNPRINTF(ns, sizeof(ns), "xmm0, [eax%+d]", v);
2103 put("vpbroadcastb", xs, ns);
2104 }
2105 }
2106 {
2107 const int tbl[] = {
2108 -1024, -512 -256, -128, -64, -32, -16, -8, -4, -2, -1,
2109 0, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512
2110 };
2111 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
2112 char xs[128], ns[128];
2113 int v = tbl[i];
2114 CYBOZU_SNPRINTF(xs, sizeof(xs), "zmm0, zmm1, ptr_b[eax%+d]", v);
2115 CYBOZU_SNPRINTF(ns, sizeof(ns), "zmm0, zmm1, [eax%+d]{1to16}", v);
2116 put("vaddps", xs, ns);
2117 }
2118 }
2119#ifdef XBYAK64
2120 put("vfmadd231ps", "zmm8, zmm31, ptr_b[r14+rbp-0x1e4]", "zmm8, zmm31, [r14+rbp-0x1e4]{1to16}");
2121#endif
2122 }
2124 {
 // Runs the generators in a fixed order. With MIN_TEST defined only putMin()
 // runs; otherwise separateFunc() is called between consecutive generators,
 // except that putDisp8N() follows putClass() directly.
2125#ifdef MIN_TEST
2126 putMin();
2127#else
2128 putOpmask();
2129 separateFunc();
2130 putCombi();
2131 separateFunc();
2132 putCmpK();
2133 separateFunc();
2134 putBroadcast();
2135 separateFunc();
2136 putAVX512_M_X();
2137 separateFunc();
2138 put_vmov();
2139 separateFunc();
2140 put512_X_XM();
2141 separateFunc();
2142 put512_X_X_XM();
2143 separateFunc();
2144 put512_X3();
2145 separateFunc();
2146 put512_X3_I();
2147 separateFunc();
2148 put512_FMA();
2149 separateFunc();
2150 put512_Y_XM();
2151 separateFunc();
2152 put512_AVX1();
2153 separateFunc();
2154 put512_cvt();
2155 separateFunc();
2156 putMisc1();
2157 separateFunc();
2158 putGather();
2159 separateFunc();
2160 putBlend();
2161 separateFunc();
2162 putVpcmp();
2163 separateFunc();
2164 putVtest();
2165 separateFunc();
2166 putCompExp();
2167 separateFunc();
2168 putPerm();
2169 separateFunc();
2170 putShuff();
2171 separateFunc();
2172 putMisc2();
2173 separateFunc();
2174 putMov();
2175 separateFunc();
2176 putRot();
2177 separateFunc();
2178 putScatter();
2179 separateFunc();
2180 putClass();
 // NOTE(review): no separateFunc() between putClass() and putDisp8N() - confirm this is intended
2181 putDisp8N();
2182#endif
2183 }
2184};
2185
2186int main(int argc, char *[])
2187{
2188 Test test(argc > 1);
2189 test.put();
2190}
#define CYBOZU_SNPRINTF(x, len,...)
Definition inttype.hpp:64
#define NUM_OF_ARRAY(x)
Definition bench.cpp:12
const mie::Vuint & p
Definition bn.cpp:27
std::string name
void putBroadcastSub(int idx, int disp)
Definition make_512.cpp:636
Test(bool isXbyak)
Definition make_512.cpp:422
void put512_AVX1()
void putMisc2()
void putCombi()
Definition make_512.cpp:541
void putBroadcast()
Definition make_512.cpp:653
~Test()
Definition make_512.cpp:443
void put512_X3()
Definition make_512.cpp:885
void put_vaddpd(const char *r1, const char *r2, const char *r3, int kIdx=0, bool z=false, int sae=0)
Definition make_512.cpp:524
void putMin()
void putMisc1()
Definition make_512.cpp:685
void putAVX512()
void putMov()
void put512_cvt()
void classSubMem(const char *nm, char x, bool broadcast, int size)
void putVtest()
void put512_X_XM()
Definition make_512.cpp:819
void put512_X3_I()
void putAVX512_M_X()
Definition make_512.cpp:750
void putGather()
void putBlend()
void separateFunc()
Definition make_512.cpp:435
void putClassSub(const char *name, int size)
void putVpcmp()
void put_vmov()
Definition make_512.cpp:768
void putDisp8N()
void putPerm()
void putCmpK()
Definition make_512.cpp:596
void putAVX1()
void put()
Definition make_512.cpp:457
void putShuff()
void put512_X_X_XM()
Definition make_512.cpp:867
void put512_FMA()
void putRot()
void put512_Y_XM()
void putOpmask()
Definition make_512.cpp:461
void putClass()
void putCompExp()
void putScatter()
@ xx_yx_zy
@ xx_yy_zz
@ xx_xy_yz
const struct Ptn tbl[]
const uint64 VM32X_64
Definition make_512.cpp:36
const uint64 VM32Y_64
Definition make_512.cpp:38
const uint64 REG32
Definition make_512.cpp:66
const uint64 REG16_2
Definition make_512.cpp:55
const uint64 XMM_SAE
Definition make_512.cpp:94
const uint64 NOPARA
Definition make_512.cpp:114
const uint64 XMM
Definition make_512.cpp:76
const uint64 YMM
Definition make_512.cpp:77
const uint64 M_xword
Definition make_512.cpp:110
const uint64 XMM_KZ
Definition make_512.cpp:100
const uint64 REG32e
Definition make_512.cpp:68
const uint64 _MEMe
Definition make_512.cpp:53
const uint64 MEM_ONLY_DISP
Definition make_512.cpp:32
const uint64 ZMM
Definition make_512.cpp:85
const uint64 IMM8
Definition make_512.cpp:18
const uint64 MEM
Definition make_512.cpp:70
const uint64 EAX
Definition make_512.cpp:16
const uint64 _REG64
Definition make_512.cpp:58
const uint64 M_yword
Definition make_512.cpp:111
const uint64 VM32Y_K
Definition make_512.cpp:73
const uint64 K
Definition make_512.cpp:78
const uint64 IMM_2
Definition make_512.cpp:74
const uint64 YMM_SAE
Definition make_512.cpp:12
const uint64 ZMM_K
Definition make_512.cpp:23
const uint64 M_1to8
Definition make_512.cpp:107
const uint64 _REG64_2
Definition make_512.cpp:59
const uint64 _MEM
Definition make_512.cpp:14
const uint64 VM32Z_K
Definition make_512.cpp:39
const uint64 _XMM2
Definition make_512.cpp:60
const uint64 REG32_2
Definition make_512.cpp:54
const uint64 IMM
Definition make_512.cpp:75
const uint64 MEM64
Definition make_512.cpp:71
const uint64 MEM8
Definition make_512.cpp:27
const uint64 _ZMM
Definition make_512.cpp:79
const uint64 VM32Y_32
Definition make_512.cpp:37
const uint64 AL
Definition make_512.cpp:25
const uint64 REG8_3
Definition make_512.cpp:57
const uint64 REG64
Definition make_512.cpp:65
const uint64 M_1to2
Definition make_512.cpp:105
const uint64 K2
Definition make_512.cpp:88
const uint64 ZMM_KZ
Definition make_512.cpp:102
const uint64 _REG8
Definition make_512.cpp:19
const uint64 REG16
Definition make_512.cpp:67
const uint64 VM32X
Definition make_512.cpp:62
const uint64 ZMM_ER
Definition make_512.cpp:90
const uint64 REG8
Definition make_512.cpp:69
const uint64 VM32X_32
Definition make_512.cpp:35
const uint64 AX
Definition make_512.cpp:24
const uint64 MY_1to4
Definition make_512.cpp:112
const uint64 _REG16
Definition make_512.cpp:20
const uint64 K_K
Definition make_512.cpp:31
const uint64 REG8_2
Definition make_512.cpp:56
const uint64 _ZMM2
Definition make_512.cpp:80
const uint64 YMM_ER
Definition make_512.cpp:72
const uint64 VM32Y
Definition make_512.cpp:63
const uint64 _YMM2
Definition make_512.cpp:61
const uint64 VM32X_K
Definition make_512.cpp:33
const uint64 XMM_K
Definition make_512.cpp:21
const uint64 _YMM
Definition make_512.cpp:34
const uint64 M_1to16
Definition make_512.cpp:108
const uint64 IMM32
Definition make_512.cpp:17
const uint64 _XMM
Definition make_512.cpp:13
const uint64 MEM32
Definition make_512.cpp:29
const uint64 IMM_1
Definition make_512.cpp:26
const uint64 YMM_K
Definition make_512.cpp:22
const uint64 _REG32
Definition make_512.cpp:15
const uint64 YMM_KZ
Definition make_512.cpp:101
const uint64 VM32Z
Definition make_512.cpp:30
const uint64 MEM_K
Definition make_512.cpp:104
const uint64 _YMM3
Definition make_512.cpp:86
const uint64 XMM_ER
Definition make_512.cpp:109
const int bitEnd
Definition make_512.cpp:10
const uint64 ZMM_SAE
Definition make_512.cpp:89
const uint64 M_1to4
Definition make_512.cpp:106
const uint64 MEM16
Definition make_512.cpp:28
Definition xbyak.h:104
const GenericPointer< typename T::ValueType > T2 T::AllocatorType & a
Definition pointer.h:1181
Xbyak::uint64 uint64
Definition quantize.cpp:51
const int N
Definition quantize.cpp:54
unsigned __int64 uint64_t
Definition stdint.h:136
int type definition and macros Copyright (C) 2008 Cybozu Labs, Inc., all rights reserved.
Xbyak ; JIT assembler for x86(IA32)/x64 by C++.
CK_ULONG d
char * s
uint16_t j