1#ifndef XBYAK_XBYAK_UTIL_H_ 
    2#define XBYAK_XBYAK_UTIL_H_ 
   13    #if (_MSC_VER < 1400) && defined(XBYAK32) 
   14        static inline __declspec(naked) 
void __cpuid(
int[4], 
int)
 
   /* Non-zero when __GNUC__.__GNUC_MINOR__ is at least major.minor. */
   #define __GNUC_PREREQ(major, minor) \
       (((__GNUC__) * 65536 + (__GNUC_MINOR__)) >= ((major) * 65536 + (minor)))
   38    #if __GNUC_PREREQ(4, 3) && !defined(__APPLE__) 
   41        #if defined(__APPLE__) && defined(XBYAK32)  
   /*
    * CPUID wrappers for the build selected by defined(__APPLE__) && defined(XBYAK32).
    * EBX is not named as an asm operand here (presumably because it is reserved on
    * that target, e.g. as the PIC register -- TODO confirm): it is saved on the stack,
    * CPUID's EBX result is copied into ESI (output operand "b"), and EBX is restored.
    * The copy must read %ebx; the earlier "movl %%ebp, %%esi" returned the frame
    * pointer instead of the CPUID EBX value.
    */
   #define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
   #define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
   /*
    * Generic GCC-style CPUID wrappers: EAX is loaded from eaxIn (and ECX from ecxIn
    * for the _count form); the resulting EAX/EBX/ECX/EDX are written to a/b/c/d.
    */
   #define __cpuid(eaxIn, a, b, c, d) \
       __asm__ __volatile__("cpuid\n" \
           : "=a"(a), "=b"(b), "=c"(c), "=d"(d) \
           : "0"(eaxIn))
   #define __cpuid_count(eaxIn, ecxIn, a, b, c, d) \
       __asm__ __volatile__("cpuid\n" \
           : "=a"(a), "=b"(b), "=c"(c), "=d"(d) \
           : "0"(eaxIn), "2"(ecxIn))
   51namespace Xbyak { 
namespace util {
 
   58    unsigned int get32bitAsBE(
const char *x)
 const 
   60        return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24);
 
   62    unsigned int mask(
int n)
 const 
   71        model = (data[0] >> 4) & mask(4);
 
   72        family = (data[0] >> 8) & mask(4);
 
   74        extModel = (data[0] >> 16) & mask(4);
 
   /*
       Return bits [end:base] of val, both bounds inclusive, shifted down to bit 0.
       Call sites pass ranges such as (22, 31) and (14, 25), i.e. the last bit of
       the field, so the field is end - base + 1 bits wide. The previous mask
       used end - base bits and silently dropped the top bit of every field.
       @param val  register value to slice
       @param base index of the lowest bit of the field
       @param end  index of the highest bit of the field (inclusive, >= base)
   */
   unsigned int extractBit(unsigned int val, unsigned int base, unsigned int end)
   {
       const unsigned int width = end - base + 1;
       const unsigned int shifted = val >> base;
       if (width >= 32) return shifted; /* 1u << 32 would be undefined */
       return shifted & ((1u << width) - 1);
   }
 
   91    void setCacheHierarchy()
 
   93        if ((type_ & 
tINTEL) == 0) 
return;
 
   94        const unsigned int NO_CACHE = 0;
 
   95        const unsigned int DATA_CACHE = 1;
 
   97        const unsigned int UNIFIED_CACHE = 3;
 
   98        unsigned int smt_width = 0;
 
   99        unsigned int n_cores = (
unsigned int) -1;
 
  100        unsigned int data[4];
 
  109            smt_width = data[1] & 0x7FFF;
 
  111            n_cores = data[1] & 0x7FFF;
 
  125            unsigned int cacheType = extractBit(data[0], 0, 4);
 
  126            if (cacheType == NO_CACHE) 
break;
 
  127            if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) {
 
  128                unsigned int nb_logical_cores = (std::min)(extractBit(data[0], 14, 25) + 1, n_cores);
 
  130                    (extractBit(data[1], 22, 31) + 1)
 
  131                    * (extractBit(data[1], 12, 21) + 1)
 
  132                    * (extractBit(data[1], 0, 11) + 1)
 
  134                if (cacheType == DATA_CACHE && smt_width == 0) smt_width = nb_logical_cores;
 
  135                assert(smt_width != 0);
 
  172    static inline void getCpuid(
unsigned int eaxIn, 
unsigned int data[4])
 
  175        __cpuid(
reinterpret_cast<int*
>(data), eaxIn);
 
  177        __cpuid(eaxIn, data[0], data[1], data[2], data[3]);
 
 
  180    static inline void getCpuidEx(
unsigned int eaxIn, 
unsigned int ecxIn, 
unsigned int data[4])
 
  183        __cpuidex(
reinterpret_cast<int*
>(data), eaxIn, ecxIn);
 
  185        __cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
 
 
  193        unsigned int eax, 
edx;
 
  196        __asm__ 
volatile(
".byte 0x0f, 0x01, 0xd0" : 
"=a"(eax), 
"=d"(
edx) : 
"c"(0));
 
 
  270        unsigned int data[4];
 
  271        const unsigned int& 
EAX = data[0];
 
  272        const unsigned int& EBX = data[1];
 
  273        const unsigned int& ECX = data[2];
 
  274        const unsigned int& EDX = data[3];
 
  276        const unsigned int maxNum = 
EAX;
 
  277        static const char intel[] = 
"ntel";
 
  278        static const char amd[] = 
"cAMD";
 
  279        if (ECX == get32bitAsBE(amd)) {
 
  282            if (EDX & (1U << 31)) type_ |= 
t3DN;
 
  283            if (EDX & (1U << 15)) type_ |= 
tCMOV;
 
  284            if (EDX & (1U << 30)) type_ |= 
tE3DN;
 
  285            if (EDX & (1U << 22)) type_ |= 
tMMX2;
 
  286            if (EDX & (1U << 27)) type_ |= 
tRDTSCP;
 
  288        if (ECX == get32bitAsBE(intel)) {
 
  291            if (EDX & (1U << 27)) type_ |= 
tRDTSCP;
 
  292            if (ECX & (1U << 5)) type_ |= 
tLZCNT;
 
  296        if (ECX & (1U << 0)) type_ |= 
tSSE3;
 
  297        if (ECX & (1U << 9)) type_ |= 
tSSSE3;
 
  298        if (ECX & (1U << 19)) type_ |= 
tSSE41;
 
  299        if (ECX & (1U << 20)) type_ |= 
tSSE42;
 
  300        if (ECX & (1U << 22)) type_ |= 
tMOVBE;
 
  301        if (ECX & (1U << 23)) type_ |= 
tPOPCNT;
 
  302        if (ECX & (1U << 25)) type_ |= 
tAESNI;
 
  304        if (ECX & (1U << 27)) type_ |= 
tOSXSAVE;
 
  305        if (ECX & (1U << 30)) type_ |= 
tRDRAND;
 
  306        if (ECX & (1U << 29)) type_ |= 
tF16C;
 
  308        if (EDX & (1U << 15)) type_ |= 
tCMOV;
 
  309        if (EDX & (1U << 23)) type_ |= 
tMMX;
 
  310        if (EDX & (1U << 25)) type_ |= 
tMMX2 | 
tSSE;
 
  311        if (EDX & (1U << 26)) type_ |= 
tSSE2;
 
  317                if (ECX & (1U << 28)) type_ |= 
tAVX;
 
  318                if (ECX & (1U << 12)) type_ |= 
tFMA;
 
  319                if (((bv >> 5) & 7) == 7) {
 
  321                    if (EBX & (1U << 16)) type_ |= 
tAVX512F;
 
  323                        if (EBX & (1U << 17)) type_ |= 
tAVX512DQ;
 
  325                        if (EBX & (1U << 26)) type_ |= 
tAVX512PF;
 
  326                        if (EBX & (1U << 27)) type_ |= 
tAVX512ER;
 
  327                        if (EBX & (1U << 28)) type_ |= 
tAVX512CD;
 
  328                        if (EBX & (1U << 30)) type_ |= 
tAVX512BW;
 
  329                        if (EBX & (1U << 31)) type_ |= 
tAVX512VL;
 
  332                        if (ECX & (1U << 8)) type_ |= 
tGFNI;
 
  333                        if (ECX & (1U << 9)) type_ |= 
tVAES;
 
  346            if (type_ & 
tAVX && (EBX & (1U << 5))) type_ |= 
tAVX2;
 
  347            if (EBX & (1U << 3)) type_ |= 
tBMI1;
 
  348            if (EBX & (1U << 8)) type_ |= 
tBMI2;
 
  350            if (EBX & (1U << 18)) type_ |= 
tRDSEED;
 
  351            if (EBX & (1U << 19)) type_ |= 
tADX;
 
  352            if (EBX & (1U << 20)) type_ |= 
tSMAP;
 
  353            if (EBX & (1U << 4)) type_ |= 
tHLE;
 
  354            if (EBX & (1U << 11)) type_ |= 
tRTM;
 
  355            if (EBX & (1U << 14)) type_ |= 
tMPX;
 
  356            if (EBX & (1U << 29)) type_ |= 
tSHA;
 
 
  364        printf(
"family=%d, model=%X, stepping=%d, extFamily=%d, extModel=%X\n",
 
 
  370        return (type & type_) != 0;
 
 
 
  381        unsigned int eax, 
edx;
 
  382        __asm__ 
volatile(
"rdtsc" : 
"=a"(eax), 
"=d"(
edx));
 
 
  402    void clear() { count_ = 0; clock_ = 0; }
 
 
  // Flag bits that StackFrame strips from its tNum argument:
  // bit 6 requests RCX, bit 7 requests RDX.
  const int UseRCX = 0x40;
  const int UseRDX = 0x80;
 
  413    static const size_t maxTblNum = 10;
 
  414    const Xbyak::Reg64 *tbl_[maxTblNum];
 
  417    Pack() : tbl_(), n_(0) {}
 
  418    Pack(
const Xbyak::Reg64 *
tbl, 
size_t n) { 
init(
tbl, n); }
 
  419    Pack(
const Pack& rhs)
 
  422        for (
size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i];
 
  424    Pack& operator=(
const Pack& rhs)
 
  427        for (
size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i];
 
  430    Pack(
const Xbyak::Reg64& t0)
 
  431    { n_ = 1; tbl_[0] = &t0; }
 
  432    Pack(
const Xbyak::Reg64& t1, 
const Xbyak::Reg64& t0)
 
  433    { n_ = 2; tbl_[0] = &t0; tbl_[1] = &t1; }
 
  434    Pack(
const Xbyak::Reg64& t2, 
const Xbyak::Reg64& t1, 
const Xbyak::Reg64& t0)
 
  435    { n_ = 3; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; }
 
  436    Pack(
const Xbyak::Reg64& t3, 
const Xbyak::Reg64& t2, 
const Xbyak::Reg64& t1, 
const Xbyak::Reg64& t0)
 
  437    { n_ = 4; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; }
 
  438    Pack(
const Xbyak::Reg64& t4, 
const Xbyak::Reg64& t3, 
const Xbyak::Reg64& t2, 
const Xbyak::Reg64& t1, 
const Xbyak::Reg64& t0)
 
  439    { n_ = 5; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; }
 
  440    Pack(
const Xbyak::Reg64& t5, 
const Xbyak::Reg64& t4, 
const Xbyak::Reg64& t3, 
const Xbyak::Reg64& t2, 
const Xbyak::Reg64& t1, 
const Xbyak::Reg64& t0)
 
  441    { n_ = 6; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; }
 
  442    Pack(
const Xbyak::Reg64& t6, 
const Xbyak::Reg64& t5, 
const Xbyak::Reg64& t4, 
const Xbyak::Reg64& t3, 
const Xbyak::Reg64& t2, 
const Xbyak::Reg64& t1, 
const Xbyak::Reg64& t0)
 
  443    { n_ = 7; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; }
 
  444    Pack(
const Xbyak::Reg64& t7, 
const Xbyak::Reg64& t6, 
const Xbyak::Reg64& t5, 
const Xbyak::Reg64& t4, 
const Xbyak::Reg64& t3, 
const Xbyak::Reg64& t2, 
const Xbyak::Reg64& t1, 
const Xbyak::Reg64& t0)
 
  445    { n_ = 8; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; }
 
  446    Pack(
const Xbyak::Reg64& t8, 
const Xbyak::Reg64& t7, 
const Xbyak::Reg64& t6, 
const Xbyak::Reg64& t5, 
const Xbyak::Reg64& t4, 
const Xbyak::Reg64& t3, 
const Xbyak::Reg64& t2, 
const Xbyak::Reg64& t1, 
const Xbyak::Reg64& t0)
 
  447    { n_ = 9; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; }
 
  448    Pack(
const Xbyak::Reg64& t9, 
const Xbyak::Reg64& t8, 
const Xbyak::Reg64& t7, 
const Xbyak::Reg64& t6, 
const Xbyak::Reg64& t5, 
const Xbyak::Reg64& t4, 
const Xbyak::Reg64& t3, 
const Xbyak::Reg64& t2, 
const Xbyak::Reg64& t1, 
const Xbyak::Reg64& t0)
 
  449    { n_ = 10; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; tbl_[9] = &t9; }
 
  450    Pack& append(
const Xbyak::Reg64& t)
 
  452        if (n_ == maxTblNum) {
 
  453            fprintf(stderr, 
"ERR Pack::can't append\n");
 
  454            throw Error(ERR_BAD_PARAMETER);
 
  459    void init(
const Xbyak::Reg64 *
tbl, 
size_t n)
 
  462            fprintf(stderr, 
"ERR Pack::init bad n=%d\n", (
int)n);
 
  463            throw Error(ERR_BAD_PARAMETER);
 
  466        for (
size_t i = 0; i < n; i++) {
 
  470    const Xbyak::Reg64& operator[](
size_t n)
 const 
  473            fprintf(stderr, 
"ERR Pack bad n=%d\n", (
int)n);
 
  474            throw Error(ERR_BAD_PARAMETER);
 
  478    size_t size()
 const { 
return n_; }
 
  482    Pack 
sub(
size_t pos, 
size_t num = 
size_t(-1))
 const 
  484        if (num == 
size_t(-1)) 
num = n_ - pos;
 
  485        if (pos + num > n_) {
 
  486            fprintf(stderr, 
"ERR Pack::sub bad pos=%d, num=%d\n", (
int)pos, (
int)num);
 
  487            throw Error(ERR_BAD_PARAMETER);
 
  491        for (
size_t i = 0; i < 
num; i++) {
 
  492            pack.tbl_[i] = tbl_[pos + i];
 
  498        for (
size_t i = 0; i < n_; i++) {
 
  499            printf(
"%s ", tbl_[i]->toString());
 
  507    static const int noSaveNum = 6;
 
  508    static const int rcxPos = 0;
 
  509    static const int rdxPos = 1;
 
  511    static const int noSaveNum = 8;
 
  512    static const int rcxPos = 3;
 
  513    static const int rdxPos = 2;
 
  523    Xbyak::Reg64 pTbl_[4];
 
  524    Xbyak::Reg64 tTbl_[10];
 
  527    StackFrame(
const StackFrame&);
 
  528    void operator=(
const StackFrame&);
 
  548    StackFrame(
Xbyak::CodeGenerator *code, 
int pNum, 
int tNum = 0, 
int stackSizeByte = 0, 
bool makeEpilog = 
true)
 
  551        , tNum_(tNum & ~(UseRCX | UseRDX))
 
  552        , useRcx_((tNum & UseRCX) != 0)
 
  553        , useRdx_((tNum & UseRDX) != 0)
 
  556        , makeEpilog_(makeEpilog)
 
  560        using namespace Xbyak;
 
  561        if (pNum < 0 || pNum > 4) 
throw Error(ERR_BAD_PNUM);
 
  562        const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0);
 
  563        if (allRegNum < pNum || allRegNum > 14) 
throw Error(ERR_BAD_TNUM);
 
  564        const Reg64& _rsp = code->rsp;
 
  566        saveNum_ = (std::max)(0, allRegNum - noSaveNum);
 
  567        const int *
tbl = getOrderTbl() + noSaveNum;
 
  568        P_ = saveNum_ + (stackSizeByte + 7) / 8;
 
  569        if (P_ > 0 && (P_ & 1) == 0) P_++; 
 
  571        if (P_ > 0) code->sub(_rsp, P_);
 
  573        for (
int i = 0; i < (std::min)(saveNum_, 4); i++) {
 
  574            code->mov(_ptr [_rsp + P_ + (i + 1) * 8], Reg64(
tbl[i]));
 
  576        for (
int i = 4; i < saveNum_; i++) {
 
  577            code->mov(_ptr [_rsp + P_ - 8 * (saveNum_ - i)], Reg64(
tbl[i]));
 
  580        for (
int i = 0; i < saveNum_; i++) {
 
  581            code->mov(_ptr [_rsp + P_ - 8 * (saveNum_ - i)], Reg64(
tbl[i]));
 
  585        for (
int i = 0; i < pNum; i++) {
 
  586            pTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
 
  588        for (
int i = 0; i < tNum_; i++) {
 
  589            tTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
 
  591        if (useRcx_ && rcxPos < pNum) code_->
mov(code_->r10, code_->rcx);
 
  592        if (useRdx_ && rdxPos < pNum) code_->
mov(code_->r11, code_->rdx);
 
  593        p_.init(pTbl_, pNum);
 
  594        t_.init(tTbl_, tNum_);
 
  600    void close(
bool callRet = 
true)
 
  602        using namespace Xbyak;
 
  603        const Reg64& _rsp = code_->rsp;
 
  605        const int *
tbl = getOrderTbl() + noSaveNum;
 
  607        for (
int i = 0; i < (std::min)(saveNum_, 4); i++) {
 
  608            code_->
mov(Reg64(
tbl[i]), _ptr [_rsp + P_ + (i + 1) * 8]);
 
  610        for (
int i = 4; i < saveNum_; i++) {
 
  611            code_->
mov(Reg64(
tbl[i]), _ptr [_rsp + P_ - 8 * (saveNum_ - i)]);
 
  614        for (
int i = 0; i < saveNum_; i++) {
 
  615            code_->
mov(Reg64(
tbl[i]), _ptr [_rsp + P_ - 8 * (saveNum_ - i)]);
 
  618        if (P_ > 0) code_->
add(_rsp, P_);
 
  620        if (callRet) code_->
ret();
 
  624        if (!makeEpilog_) 
return;
 
  627        } 
catch (std::exception& e) {
 
  628            printf(
"ERR:StackFrame %s\n", e.what());
 
  631            printf(
"ERR:StackFrame otherwise\n");
 
  636    const int *getOrderTbl()
 const 
  638        using namespace Xbyak;
 
  639        static const int tbl[] = {
 
  641            Operand::RCX, Operand::RDX, Operand::R8, Operand::R9, Operand::R10, Operand::R11, Operand::RDI, Operand::RSI,
 
  643            Operand::RDI, Operand::RSI, Operand::RDX, Operand::RCX, Operand::R8, Operand::R9, Operand::R10, Operand::R11,
 
  645            Operand::RBX, Operand::RBP, Operand::R12, Operand::R13, Operand::R14, Operand::R15
 
  649    int getRegIdx(
int& pos)
 const 
  652        using namespace Xbyak;
 
  653        const int *
tbl = getOrderTbl();
 
  656            if (
r == Operand::RCX) { 
return Operand::R10; }
 
  657            if (
r == Operand::R10) { 
r = 
tbl[pos++]; }
 
  660            if (
r == Operand::RDX) { 
return Operand::R11; }
 
  661            if (
r == Operand::R11) { 
return tbl[pos++]; }
 
void add(const Operand &op, uint32 imm)
void mov(const Operand &reg1, const Operand &reg2)
static const Type tAVX512PF
static const Type tRDRAND
static const unsigned int maxNumberCacheLevels
static const Type tPREFETCHW
unsigned int data_cache_size[maxNumberCacheLevels]
unsigned int cores_sharing_data_cache[maxNumberCacheLevels]
unsigned int getDataCacheLevels() const
static void getCpuid(unsigned int eaxIn, unsigned int data[4])
bool has(Type type) const
unsigned int getCoresSharingDataCache(unsigned int i) const
static const Type tAVX512_VBMI
static void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4])
static const Type tAVX512_VPOPCNTDQ
static const Type tAVX512_BITALG
static const Type tOSXSAVE
static const Type tPCLMULQDQ
static const Type tPREFETCHWT1
static const Type tAVX512CD
static const Type tVPCLMULQDQ
unsigned int data_cache_levels
static const Type tAVX512_4VNNIW
static const Type tAVX512_IFMA
static const Type tAVX512BW
static const Type tAVX512_VNNI
static const Type tAVX512ER
static uint64 getXfeature()
static const Type tAVX512IFMA
static const Type tRDSEED
static const Type tPOPCNT
static const Type tRDTSCP
static const Type tAVX512VBMI
unsigned int getDataCacheSize(unsigned int i) const
static const Type tENHANCED_REP
static const Type tAVX512DQ
static const Type tAVX512_VBMI2
static const Type tAVX512VL
static const Type tAVX512F
static const Type tAVX512_4FMAPS
void close(T *e, websocketpp::connection_hdl hdl)
LOGGING_API void printf(Category category, const char *format,...)
static const Reg32 esp(Operand::ESP)
static const Reg32 ecx(Operand::ECX)
static const Reg32 edx(Operand::EDX)
static const AddressFrame dword(32)
static const Reg32 ebx(Operand::EBX)
static const Reg32 esi(Operand::ESI)
void pack(instruction_stream *stream, uint32_t field)
Xbyak ; JIT assembler for x86(IA32)/x64 by C++.
void sub(const Operand &op, uint32 imm)
#define __cpuid(eaxIn, a, b, c, d)
#define __cpuid_count(eaxIn, ecxIn, a, b, c, d)