LCOV - code coverage report
Current view: top level - asmjit - x86internal.cpp (source / functions)
Test: plumed test coverage (other modules)
Date: 2024-10-18 13:59:33
Coverage:    Lines: 113 of 394 (28.7 %)    Functions: 6 of 14 (42.9 %)

          Line data    Source code
       1             : /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
       2             : Copyright (c) 2008-2017, Petr Kobalicek
       3             : 
       4             : This software is provided 'as-is', without any express or implied
       5             : warranty. In no event will the authors be held liable for any damages
       6             : arising from the use of this software.
       7             : 
       8             : Permission is granted to anyone to use this software for any purpose,
       9             : including commercial applications, and to alter it and redistribute it
      10             : freely, subject to the following restrictions:
      11             : 
      12             : 1. The origin of this software must not be misrepresented; you must not
      13             :    claim that you wrote the original software. If you use this software
      14             :    in a product, an acknowledgment in the product documentation would be
      15             :    appreciated but is not required.
      16             : 2. Altered source versions must be plainly marked as such, and must not be
      17             :    misrepresented as being the original software.
      18             : 3. This notice may not be removed or altered from any source distribution.
      19             : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
      20             : #ifdef __PLUMED_HAS_ASMJIT
      21             : #pragma GCC diagnostic push
      22             : #pragma GCC diagnostic ignored "-Wpedantic"
      23             : // [AsmJit]
      24             : // Complete x86/x64 JIT and Remote Assembler for C++.
      25             : //
      26             : // [License]
      27             : // Zlib - See LICENSE.md file in the package.
      28             : 
      29             : // [Export]
      30             : #define ASMJIT_EXPORTS
      31             : 
      32             : // [Guard]
      33             : #include "./asmjit_build.h"
      34             : #if defined(ASMJIT_BUILD_X86)
      35             : 
      36             : // [Dependencies]
      37             : #include "./x86internal_p.h"
      38             : 
      39             : // [Api-Begin]
      40             : #include "./asmjit_apibegin.h"
      41             : 
      42             : namespace PLMD {
      43             : namespace asmjit {
      44             : 
      45             : // ============================================================================
      46             : // [asmjit::X86Internal - Helpers]
      47             : // ============================================================================
      48             : 
      49             : static ASMJIT_INLINE uint32_t x86GetXmmMovInst(const FuncFrameLayout& layout) {
      50             :   bool avx = layout.isAvxEnabled();
      51             :   bool aligned = layout.hasAlignedVecSR();
      52             : 
      53           0 :   return aligned ? (avx ? X86Inst::kIdVmovaps : X86Inst::kIdMovaps)
      54             :                  : (avx ? X86Inst::kIdVmovups : X86Inst::kIdMovups);
      55             : }
      56             : 
      57             : static ASMJIT_INLINE uint32_t x86VecTypeIdToRegType(uint32_t typeId) noexcept {
      58       12828 :   return typeId <= TypeId::_kVec128End ? X86Reg::kRegXmm :
      59             :          typeId <= TypeId::_kVec256End ? X86Reg::kRegYmm :
      60             :                                          X86Reg::kRegZmm ;
      61             : }
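                     : // For example (derived from the ranges above): a 128-bit vector type id
                     : // such as 4 x f32 maps to kRegXmm, a 256-bit type id to kRegYmm, and
                     : // anything larger to kRegZmm.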
      62             : 
      63             : // ============================================================================
      64             : // [asmjit::X86FuncArgsContext]
      65             : // ============================================================================
      66             : 
      67             : // Used by both `X86Internal::argsToFrameInfo()` and `X86Internal::allocArgs()`.
      68             : class X86FuncArgsContext {
      69             : public:
      70             :   typedef FuncDetail::Value SrcArg;
      71             :   typedef FuncArgsMapper::Value DstArg;
      72             : 
      73             :   enum { kMaxVRegKinds = Globals::kMaxVRegKinds };
      74             : 
      75             :   struct WorkData {
      76             :     uint32_t archRegs;                   //!< Architecture provided and allocable regs.
      77             :     uint32_t workRegs;                   //!< Registers that can be used by shuffler.
      78             :     uint32_t usedRegs;                   //!< Only registers used to pass arguments.
      79             :     uint32_t srcRegs;                    //!< Source registers that need shuffling.
      80             :     uint32_t dstRegs;                    //!< Destination registers that need shuffling.
      81             :     uint8_t numOps;                      //!< Number of operations to finish.
      82             :     uint8_t numSwaps;                    //!< Number of register swaps.
      83             :     uint8_t numStackArgs;                //!< Number of stack loads.
      84             :     uint8_t reserved[9];                 //!< Reserved (only used as padding).
      85             :     uint8_t argIndex[32];                //!< Only valid if a corresponding bit in `usedRegs` is true.
      86             :   };
      87             : 
      88             :   X86FuncArgsContext() noexcept;
      89             :   Error initWorkData(const FuncArgsMapper& args, const uint32_t* dirtyRegs, bool preservedFP) noexcept;
      90             : 
      91             :   Error markRegsForSwaps(FuncFrameInfo& ffi) noexcept;
      92             :   Error markDstRegsDirty(FuncFrameInfo& ffi) noexcept;
      93             :   Error markStackArgsReg(FuncFrameInfo& ffi) noexcept;
      94             : 
      95             :   // --------------------------------------------------------------------------
      96             :   // [Members]
      97             :   // --------------------------------------------------------------------------
      98             : 
      99             :   WorkData _workData[kMaxVRegKinds];
     100             :   bool _hasStackArgs;
     101             :   bool _hasRegSwaps;
     102             : };
     103             : 
     104           0 : X86FuncArgsContext::X86FuncArgsContext() noexcept {
     105           0 :   ::memset(_workData, 0, sizeof(_workData));
     106           0 :   _hasStackArgs = false;
     107           0 :   _hasRegSwaps = false;
     108           0 : }
     109             : 
     110           0 : ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::initWorkData(const FuncArgsMapper& args, const uint32_t* dirtyRegs, bool preservedFP) noexcept {
     111             :   // This code has to be updated if `kMaxVRegKinds` changes.
     112             :   ASMJIT_ASSERT(kMaxVRegKinds == 4);
     113             : 
     114             :   uint32_t i;
     115             :   const FuncDetail& func = *args.getFuncDetail();
     116             : 
     117             :   uint32_t archType = func.getCallConv().getArchType();
     118           0 :   uint32_t count = (archType == ArchInfo::kTypeX86) ? 8 : 16;
     119             : 
     120             :   // Initialize WorkData::archRegs.
     121           0 :   _workData[X86Reg::kKindGp ].archRegs = Utils::bits(count) & ~Utils::mask(X86Gp::kIdSp);
     122           0 :   _workData[X86Reg::kKindMm ].archRegs = Utils::bits(8);
     123           0 :   _workData[X86Reg::kKindK  ].archRegs = Utils::bits(8);
     124           0 :   _workData[X86Reg::kKindVec].archRegs = Utils::bits(count);
     125             : 
     126           0 :   if (preservedFP)
     127           0 :     _workData[X86Reg::kKindGp].archRegs &= ~Utils::mask(X86Gp::kIdBp);
     128             : 
     129             :   // Initialize WorkData::workRegs.
     130           0 :   for (i = 0; i < kMaxVRegKinds; i++)
     131           0 :     _workData[i].workRegs = _workData[i].archRegs & (dirtyRegs[i] | ~func.getCallConv().getPreservedRegs(i));
     132             : 
     133             :   // Build WorkData.
     134           0 :   for (i = 0; i < kFuncArgCountLoHi; i++) {
     135           0 :     const DstArg& dstArg = args.getArg(i);
     136           0 :     if (!dstArg.isAssigned()) continue;
     137             : 
     138             :     const SrcArg& srcArg = func.getArg(i);
     139           0 :     if (ASMJIT_UNLIKELY(!srcArg.isAssigned()))
     140             :       return DebugUtils::errored(kErrorInvalidState);
     141             : 
     142             :     uint32_t dstRegType = dstArg.getRegType();
     143           0 :     if (ASMJIT_UNLIKELY(dstRegType >= X86Reg::kRegCount))
     144             :       return DebugUtils::errored(kErrorInvalidRegType);
     145             : 
     146             :     uint32_t dstRegKind = X86Reg::kindOf(dstRegType);
     147           0 :     if (ASMJIT_UNLIKELY(dstRegKind >= kMaxVRegKinds))
     148             :       return DebugUtils::errored(kErrorInvalidState);
     149             : 
     150             :     WorkData& dstData = _workData[dstRegKind];
     151             :     uint32_t dstRegId = dstArg.getRegId();
     152           0 :     if (ASMJIT_UNLIKELY(dstRegId >= 32 || !(dstData.archRegs & Utils::mask(dstRegId))))
     153             :       return DebugUtils::errored(kErrorInvalidPhysId);
     154             : 
     155             :     uint32_t dstRegMask = Utils::mask(dstRegId);
     156           0 :     if (ASMJIT_UNLIKELY(dstData.usedRegs & dstRegMask))
     157             :       return DebugUtils::errored(kErrorOverlappedRegs);
     158             : 
     159           0 :     dstData.usedRegs |= dstRegMask;
     160           0 :     dstData.argIndex[dstRegId] = static_cast<uint8_t>(i);
     161             : 
     162           0 :     if (srcArg.byReg()) {
     163             :       uint32_t srcRegKind = X86Reg::kindOf(srcArg.getRegType());
     164             :       uint32_t srcRegId = srcArg.getRegId();
     165             :       uint32_t srcRegMask = Utils::mask(srcRegId);
     166             : 
     167           0 :       if (dstRegKind == srcRegKind) {
      168             :         // The best case: the register is already allocated where it is expected to be.
     169           0 :         if (dstRegId == srcRegId) continue;
     170             : 
     171             :         // Detect a register swap.
     172           0 :         if (dstData.usedRegs & srcRegMask) {
     173           0 :           const SrcArg& ref = func.getArg(dstData.argIndex[srcRegId]);
     174           0 :           if (ref.byReg() && X86Reg::kindOf(ref.getRegType()) == dstRegKind && ref.getRegId() == dstRegId) {
     175           0 :             dstData.numSwaps++;
     176           0 :             _hasRegSwaps = true;
     177             :           }
     178             :         }
     179           0 :         dstData.srcRegs |= srcRegMask;
     180             :       }
     181             :       else {
     182           0 :         if (ASMJIT_UNLIKELY(srcRegKind >= kMaxVRegKinds))
     183             :           return DebugUtils::errored(kErrorInvalidState);
     184             : 
     185             :         WorkData& srcData = _workData[srcRegKind];
     186           0 :         srcData.srcRegs |= srcRegMask;
     187             :       }
     188             :     }
     189             :     else {
     190           0 :       dstData.numStackArgs++;
     191           0 :       _hasStackArgs = true;
     192             :     }
     193             : 
     194           0 :     dstData.numOps++;
     195           0 :     dstData.dstRegs |= dstRegMask;
     196             :   }
     197             : 
     198             :   return kErrorOk;
     199             : }
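                     : // A minimal standalone sketch of the two bit-mask helpers this function
                     : // leans on; `bitsLoN` and `maskOf` are local stand-ins for asmjit's
                     : // `Utils::bits` and `Utils::mask`, with semantics inferred from their use.
                     : static inline uint32_t bitsLoN(uint32_t n) {
                     :   // Mask of the `n` lowest bits, e.g. bitsLoN(8) == 0xFF.
                     :   return n >= 32 ? 0xFFFFFFFFu : ((uint32_t(1) << n) - 1u);
                     : }
                     : static inline uint32_t maskOf(uint32_t i) {
                     :   // Single-bit mask, e.g. maskOf(4) == 0x10 (id 4 is ESP|RSP).
                     :   return uint32_t(1) << i;
                     : }
                     : // The GP `archRegs` initializer above then reads as
                     : // bitsLoN(count) & ~maskOf(4), i.e. 0xEF when count == 8 (X86).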
     200             : 
     201           0 : ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::markDstRegsDirty(FuncFrameInfo& ffi) noexcept {
     202           0 :   for (uint32_t i = 0; i < kMaxVRegKinds; i++) {
     203             :     WorkData& wd = _workData[i];
     204           0 :     uint32_t regs = wd.usedRegs | wd.dstRegs;
     205             : 
     206           0 :     wd.workRegs |= regs;
     207             :     ffi.addDirtyRegs(i, regs);
     208             :   }
     209             : 
     210           0 :   return kErrorOk;
     211             : }
     212             : 
     213           0 : ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::markRegsForSwaps(FuncFrameInfo& ffi) noexcept {
     214           0 :   if (!_hasRegSwaps)
     215             :     return kErrorOk;
     216             : 
      217             :   // If some registers require swapping then select one dirty register that
      218             :   // can be used as a temporary. We could also do it without one (by using
      219             :   // xors), but using a temporary is both safer and faster.
     220           0 :   for (uint32_t i = 0; i < kMaxVRegKinds; i++) {
     221             :     // Skip all register kinds where swapping is natively supported (GP regs).
     222           0 :     if (i == X86Reg::kKindGp) continue;
     223             : 
     224             :     // Skip all register kinds that don't require swapping.
     225             :     WorkData& wd = _workData[i];
     226           0 :     if (!wd.numSwaps) continue;
     227             : 
     228             :     // Initially, pick some clobbered or dirty register.
     229           0 :     uint32_t workRegs = wd.workRegs;
     230           0 :     uint32_t regs = workRegs & ~(wd.usedRegs | wd.dstRegs);
     231             : 
     232             :     // If that didn't work out pick some register which is not in 'used'.
     233           0 :     if (!regs) regs = workRegs & ~wd.usedRegs;
     234             : 
     235             :     // If that didn't work out pick any other register that is allocable.
     236             :     // This last resort case will, however, result in marking one more
     237             :     // register dirty.
     238           0 :     if (!regs) regs = wd.archRegs & ~workRegs;
     239             : 
     240             :     // If that didn't work out we will have to use xors instead of moves.
     241           0 :     if (!regs) continue;
     242             : 
     243             :     uint32_t regMask = Utils::mask(Utils::findFirstBit(regs));
     244           0 :     wd.workRegs |= regMask;
     245             :     ffi.addDirtyRegs(i, regMask);
     246             :   }
     247             : 
     248             :   return kErrorOk;
     249             : }
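                     : // A condensed standalone sketch of the three-step fallback above, assuming
                     : // the same mask semantics; `pickTempReg` is a hypothetical local helper
                     : // that returns 32 when no register is available and xors must be used.
                     : static inline uint32_t pickTempReg(uint32_t archRegs, uint32_t workRegs,
                     :                                    uint32_t usedRegs, uint32_t dstRegs) {
                     :   uint32_t regs = workRegs & ~(usedRegs | dstRegs); // Clobbered/dirty first.
                     :   if (!regs) regs = workRegs & ~usedRegs;           // Then any non-'used' reg.
                     :   if (!regs) regs = archRegs & ~workRegs;           // Last resort: dirty one more.
                     :   return regs ? uint32_t(__builtin_ctz(regs)) : 32; // Lowest set bit, if any.
                     : }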
     250             : 
     251           0 : ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::markStackArgsReg(FuncFrameInfo& ffi) noexcept {
     252           0 :   if (!_hasStackArgs)
     253             :     return kErrorOk;
     254             : 
     255             :   // Decide which register to use to hold the stack base address.
     256           0 :   if (!ffi.hasPreservedFP()) {
     257             :     WorkData& wd = _workData[X86Reg::kKindGp];
     258             :     uint32_t saRegId = ffi.getStackArgsRegId();
     259           0 :     uint32_t usedRegs = wd.usedRegs;
     260             : 
     261           0 :     if (saRegId != Globals::kInvalidRegId) {
      262             :       // Check that the user-chosen SA register doesn't overlap with others.
      263             :       // However, it's fine if it overlaps with some 'dstMove' register.
     264           0 :       if (usedRegs & Utils::mask(saRegId))
     265             :         return DebugUtils::errored(kErrorOverlappingStackRegWithRegArg);
     266             :     }
     267             :     else {
      268             :       // Initially, pick some clobbered or dirty register that is neither
      269             :       // in 'used' nor in 'dstMove'. That's the safest bet, as the register
      270             :       // won't collide with anything right now.
     271           0 :       uint32_t regs = wd.workRegs & ~(usedRegs | wd.dstRegs);
     272             : 
     273             :       // If that didn't work out pick some register which is not in 'used'.
     274           0 :       if (!regs) regs = wd.workRegs & ~usedRegs;
     275             : 
     276             :       // If that didn't work out then we have to make one more register dirty.
     277           0 :       if (!regs) regs = wd.archRegs & ~wd.workRegs;
     278             : 
     279             :       // If that didn't work out we can't continue.
     280           0 :       if (ASMJIT_UNLIKELY(!regs))
     281             :         return DebugUtils::errored(kErrorNoMorePhysRegs);
     282             : 
     283             :       saRegId = Utils::findFirstBit(regs);
     284             :       ffi.setStackArgsRegId(saRegId);
     285             :     }
     286             :   }
     287             :   else {
     288             :     ffi.setStackArgsRegId(X86Gp::kIdBp);
     289             :   }
     290             : 
     291             :   return kErrorOk;
     292             : }
     293             : 
     294             : // ============================================================================
     295             : // [asmjit::X86Internal - CallConv]
     296             : // ============================================================================
     297             : 
     298       45233 : ASMJIT_FAVOR_SIZE Error X86Internal::initCallConv(CallConv& cc, uint32_t ccId) noexcept {
     299             :   const uint32_t kKindGp  = X86Reg::kKindGp;
     300             :   const uint32_t kKindVec = X86Reg::kKindVec;
     301             :   const uint32_t kKindMm  = X86Reg::kKindMm;
     302             :   const uint32_t kKindK   = X86Reg::kKindK;
     303             : 
     304             :   const uint32_t kZax = X86Gp::kIdAx;
     305             :   const uint32_t kZbx = X86Gp::kIdBx;
     306             :   const uint32_t kZcx = X86Gp::kIdCx;
     307             :   const uint32_t kZdx = X86Gp::kIdDx;
     308             :   const uint32_t kZsp = X86Gp::kIdSp;
     309             :   const uint32_t kZbp = X86Gp::kIdBp;
     310             :   const uint32_t kZsi = X86Gp::kIdSi;
     311             :   const uint32_t kZdi = X86Gp::kIdDi;
     312             : 
     313       45233 :   switch (ccId) {
     314             :     case CallConv::kIdX86StdCall:
     315             :       cc.setFlags(CallConv::kFlagCalleePopsStack);
     316           0 :       goto X86CallConv;
     317             : 
     318             :     case CallConv::kIdX86MsThisCall:
     319             :       cc.setFlags(CallConv::kFlagCalleePopsStack);
     320             :       cc.setPassedOrder(kKindGp, kZcx);
     321           0 :       goto X86CallConv;
     322             : 
     323             :     case CallConv::kIdX86MsFastCall:
     324             :     case CallConv::kIdX86GccFastCall:
     325             :       cc.setFlags(CallConv::kFlagCalleePopsStack);
     326             :       cc.setPassedOrder(kKindGp, kZcx, kZdx);
     327           0 :       goto X86CallConv;
     328             : 
     329             :     case CallConv::kIdX86GccRegParm1:
     330             :       cc.setPassedOrder(kKindGp, kZax);
     331           0 :       goto X86CallConv;
     332             : 
     333             :     case CallConv::kIdX86GccRegParm2:
     334             :       cc.setPassedOrder(kKindGp, kZax, kZdx);
     335           0 :       goto X86CallConv;
     336             : 
     337             :     case CallConv::kIdX86GccRegParm3:
     338             :       cc.setPassedOrder(kKindGp, kZax, kZdx, kZcx);
     339           0 :       goto X86CallConv;
     340             : 
     341             :     case CallConv::kIdX86CDecl:
     342           0 : X86CallConv:
     343             :       cc.setNaturalStackAlignment(4);
     344             :       cc.setArchType(ArchInfo::kTypeX86);
     345             :       cc.setPreservedRegs(kKindGp, Utils::mask(kZbx, kZsp, kZbp, kZsi, kZdi));
     346             :       break;
     347             : 
     348             :     case CallConv::kIdX86Win64:
     349             :       cc.setArchType(ArchInfo::kTypeX64);
     350             :       cc.setAlgorithm(CallConv::kAlgorithmWin64);
     351             :       cc.setFlags(CallConv::kFlagPassFloatsByVec | CallConv::kFlagIndirectVecArgs);
     352             :       cc.setNaturalStackAlignment(16);
     353             :       cc.setSpillZoneSize(32);
     354             :       cc.setPassedOrder(kKindGp, kZcx, kZdx, 8, 9);
     355             :       cc.setPassedOrder(kKindVec, 0, 1, 2, 3);
     356             :       cc.setPreservedRegs(kKindGp, Utils::mask(kZbx, kZsp, kZbp, kZsi, kZdi, 12, 13, 14, 15));
     357             :       cc.setPreservedRegs(kKindVec, Utils::mask(6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
     358             :       break;
     359             : 
     360             :     case CallConv::kIdX86SysV64:
     361             :       cc.setArchType(ArchInfo::kTypeX64);
     362             :       cc.setFlags(CallConv::kFlagPassFloatsByVec);
     363             :       cc.setNaturalStackAlignment(16);
     364             :       cc.setRedZoneSize(128);
     365             :       cc.setPassedOrder(kKindGp, kZdi, kZsi, kZdx, kZcx, 8, 9);
     366             :       cc.setPassedOrder(kKindVec, 0, 1, 2, 3, 4, 5, 6, 7);
     367             :       cc.setPreservedRegs(kKindGp, Utils::mask(kZbx, kZsp, kZbp, 12, 13, 14, 15));
     368             :       break;
     369             : 
     370           0 :     case CallConv::kIdX86FastEval2:
     371             :     case CallConv::kIdX86FastEval3:
     372             :     case CallConv::kIdX86FastEval4: {
     373           0 :       uint32_t n = ccId - CallConv::kIdX86FastEval2;
     374             : 
     375             :       cc.setArchType(ArchInfo::kTypeX86);
     376             :       cc.setFlags(CallConv::kFlagPassFloatsByVec);
     377             :       cc.setNaturalStackAlignment(16);
     378             :       cc.setPassedOrder(kKindGp, kZax, kZdx, kZcx, kZsi, kZdi);
     379             :       cc.setPassedOrder(kKindMm, 0, 1, 2, 3, 4, 5, 6, 7);
     380             :       cc.setPassedOrder(kKindVec, 0, 1, 2, 3, 4, 5, 6, 7);
     381             : 
     382             :       cc.setPreservedRegs(kKindGp , Utils::bits(8));
     383           0 :       cc.setPreservedRegs(kKindVec, Utils::bits(8) & ~Utils::bits(n));
     384             :       cc.setPreservedRegs(kKindMm , Utils::bits(8));
     385             :       cc.setPreservedRegs(kKindK  , Utils::bits(8));
     386             :       break;
     387             :     }
     388             : 
     389           0 :     case CallConv::kIdX64FastEval2:
     390             :     case CallConv::kIdX64FastEval3:
     391             :     case CallConv::kIdX64FastEval4: {
     392           0 :       uint32_t n = ccId - CallConv::kIdX64FastEval2;
     393             : 
     394             :       cc.setArchType(ArchInfo::kTypeX64);
     395             :       cc.setFlags(CallConv::kFlagPassFloatsByVec);
     396             :       cc.setNaturalStackAlignment(16);
     397             :       cc.setPassedOrder(kKindGp, kZax, kZdx, kZcx, kZsi, kZdi);
     398             :       cc.setPassedOrder(kKindMm, 0, 1, 2, 3, 4, 5, 6, 7);
     399             :       cc.setPassedOrder(kKindVec, 0, 1, 2, 3, 4, 5, 6, 7);
     400             : 
     401             :       cc.setPreservedRegs(kKindGp , Utils::bits(16));
     402           0 :       cc.setPreservedRegs(kKindVec,~Utils::bits(n));
     403             :       cc.setPreservedRegs(kKindMm , Utils::bits(8));
     404             :       cc.setPreservedRegs(kKindK  , Utils::bits(8));
     405             :       break;
     406             :     }
     407             : 
     408             :     default:
     409             :       return DebugUtils::errored(kErrorInvalidArgument);
     410             :   }
     411             : 
     412             :   cc.setId(ccId);
     413       45233 :   return kErrorOk;
     414             : }
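                     : // Worked example (read off the kIdX86SysV64 case above): GP arguments are
                     : // passed in rdi, rsi, rdx, rcx, r8, r9 and vector arguments in xmm0..xmm7;
                     : // the preserved GP set is {rbx, rsp, rbp, r12, r13, r14, r15}; the natural
                     : // stack alignment is 16 and a 128-byte red zone is available.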
     415             : 
     416             : // ============================================================================
     417             : // [asmjit::X86Internal - FuncDetail]
     418             : // ============================================================================
     419             : 
     420       45233 : ASMJIT_FAVOR_SIZE Error X86Internal::initFuncDetail(FuncDetail& func, const FuncSignature& sign, uint32_t gpSize) noexcept {
     421             :   const CallConv& cc = func.getCallConv();
     422             :   uint32_t archType = cc.getArchType();
     423             : 
     424             :   uint32_t i;
     425             :   uint32_t argCount = func.getArgCount();
     426             : 
     427       45233 :   if (func.getRetCount() != 0) {
     428             :     uint32_t typeId = func._rets[0].getTypeId();
     429       45233 :     switch (typeId) {
     430           0 :       case TypeId::kI64:
     431             :       case TypeId::kU64: {
     432           0 :         if (archType == ArchInfo::kTypeX86) {
     433             :           // Convert a 64-bit return to two 32-bit returns.
     434           0 :           func._retCount = 2;
     435           0 :           typeId -= 2;
     436             : 
     437             :           // 64-bit value is returned in EDX:EAX on X86.
     438             :           func._rets[0].initReg(typeId, X86Gp::kRegGpd, X86Gp::kIdAx);
     439             :           func._rets[1].initReg(typeId, X86Gp::kRegGpd, X86Gp::kIdDx);
     440             :           break;
     441             :         }
     442             :         else {
     443             :           func._rets[0].initReg(typeId, X86Gp::kRegGpq, X86Gp::kIdAx);
     444             :         }
     445             :         break;
     446             :       }
     447             : 
     448           0 :       case TypeId::kI8:
     449             :       case TypeId::kU8:
     450             :       case TypeId::kI16:
     451             :       case TypeId::kU16:
     452             :       case TypeId::kI32:
     453             :       case TypeId::kU32: {
     454             :         func._rets[0].assignToReg(X86Gp::kRegGpd, X86Gp::kIdAx);
     455             :         break;
     456             :       }
     457             : 
     458       45233 :       case TypeId::kF32:
     459             :       case TypeId::kF64: {
     460       45233 :         uint32_t regType = (archType == ArchInfo::kTypeX86) ? X86Reg::kRegFp : X86Reg::kRegXmm;
     461             :         func._rets[0].assignToReg(regType, 0);
     462             :         break;
     463             :       }
     464             : 
     465           0 :       case TypeId::kF80: {
     466             :         // 80-bit floats are always returned by FP0.
     467             :         func._rets[0].assignToReg(X86Reg::kRegFp, 0);
     468             :         break;
     469             :       }
     470             : 
     471           0 :       case TypeId::kMmx32:
     472             :       case TypeId::kMmx64: {
     473             :         // On X64 MM register(s) are returned through XMM or GPQ (Win64).
     474             :         uint32_t regType = X86Reg::kRegMm;
     475           0 :         if (archType != ArchInfo::kTypeX86)
     476           0 :           regType = cc.getAlgorithm() == CallConv::kAlgorithmDefault ? X86Reg::kRegXmm : X86Reg::kRegGpq;
     477             : 
     478             :         func._rets[0].assignToReg(regType, 0);
     479             :         break;
     480             :       }
     481             : 
     482           0 :       default: {
     483             :         func._rets[0].assignToReg(x86VecTypeIdToRegType(typeId), 0);
     484             :         break;
     485             :       }
     486             :     }
     487             :   }
     488             : 
     489       45233 :   uint32_t stackBase = gpSize;
     490       45233 :   uint32_t stackOffset = stackBase + cc._spillZoneSize;
     491             : 
     492       45233 :   if (cc.getAlgorithm() == CallConv::kAlgorithmDefault) {
     493             :     uint32_t gpzPos = 0;
     494             :     uint32_t vecPos = 0;
     495             : 
     496       58873 :     for (i = 0; i < argCount; i++) {
     497             :       FuncDetail::Value& arg = func._args[i];
     498             :       uint32_t typeId = arg.getTypeId();
     499             : 
     500       13640 :       if (TypeId::isInt(typeId)) {
     501         812 :         uint32_t regId = gpzPos < CallConv::kNumRegArgsPerKind ? cc._passedOrder[X86Reg::kKindGp].id[gpzPos] : Globals::kInvalidRegId;
     502         812 :         if (regId != Globals::kInvalidRegId) {
     503             :           uint32_t regType = (typeId <= TypeId::kU32)
     504         812 :             ? X86Reg::kRegGpd
     505             :             : X86Reg::kRegGpq;
     506             :           arg.assignToReg(regType, regId);
     507             :           func.addUsedRegs(X86Reg::kKindGp, Utils::mask(regId));
     508         812 :           gpzPos++;
     509             :         }
     510             :         else {
     511           0 :           uint32_t size = std::max<uint32_t>(TypeId::sizeOf(typeId), gpSize);
     512             :           arg.assignToStack(stackOffset);
     513           0 :           stackOffset += size;
     514             :         }
     515         812 :         continue;
     516         812 :       }
     517             : 
     518       12828 :       if (TypeId::isFloat(typeId) || TypeId::isVec(typeId)) {
     519       12828 :         uint32_t regId = vecPos < CallConv::kNumRegArgsPerKind ? cc._passedOrder[X86Reg::kKindVec].id[vecPos] : Globals::kInvalidRegId;
     520             : 
      521             :         // If this is a float but `kFlagPassFloatsByVec` is not set, it must be passed on the stack.
     522       12828 :         if (TypeId::isFloat(typeId) && !cc.hasFlag(CallConv::kFlagPassFloatsByVec))
     523             :           regId = Globals::kInvalidRegId;
     524             : 
     525       12828 :         if (regId != Globals::kInvalidRegId) {
     526             :           arg.initReg(typeId, x86VecTypeIdToRegType(typeId), regId);
     527             :           func.addUsedRegs(X86Reg::kKindVec, Utils::mask(regId));
     528       12828 :           vecPos++;
     529             :         }
     530             :         else {
     531             :           int32_t size = TypeId::sizeOf(typeId);
     532             :           arg.assignToStack(stackOffset);
     533           0 :           stackOffset += size;
     534             :         }
     535       12828 :         continue;
     536       12828 :       }
     537             :     }
     538             :   }
     539             : 
     540       45233 :   if (cc.getAlgorithm() == CallConv::kAlgorithmWin64) {
     541           0 :     for (i = 0; i < argCount; i++) {
     542             :       FuncDetail::Value& arg = func._args[i];
     543             : 
     544             :       uint32_t typeId = arg.getTypeId();
     545             :       uint32_t size = TypeId::sizeOf(typeId);
     546             : 
     547           0 :       if (TypeId::isInt(typeId) || TypeId::isMmx(typeId)) {
     548           0 :         uint32_t regId = i < CallConv::kNumRegArgsPerKind ? cc._passedOrder[X86Reg::kKindGp].id[i] : Globals::kInvalidRegId;
     549           0 :         if (regId != Globals::kInvalidRegId) {
     550           0 :           uint32_t regType = (size <= 4 && !TypeId::isMmx(typeId))
     551           0 :             ? X86Reg::kRegGpd
     552             :             : X86Reg::kRegGpq;
     553             : 
     554             :           arg.assignToReg(regType, regId);
     555             :           func.addUsedRegs(X86Reg::kKindGp, Utils::mask(regId));
     556             :         }
     557             :         else {
     558             :           arg.assignToStack(stackOffset);
     559           0 :           stackOffset += gpSize;
     560             :         }
     561           0 :         continue;
     562           0 :       }
     563             : 
     564           0 :       if (TypeId::isFloat(typeId) || TypeId::isVec(typeId)) {
     565           0 :         uint32_t regId = i < CallConv::kNumRegArgsPerKind ? cc._passedOrder[X86Reg::kKindVec].id[i] : Globals::kInvalidRegId;
     566           0 :         if (regId != Globals::kInvalidRegId && (TypeId::isFloat(typeId) || cc.hasFlag(CallConv::kFlagVectorCall))) {
     567             :           uint32_t regType = x86VecTypeIdToRegType(typeId);
     568             :           uint32_t regId = cc._passedOrder[X86Reg::kKindVec].id[i];
     569             : 
     570             :           arg.assignToReg(regType, regId);
     571             :           func.addUsedRegs(X86Reg::kKindVec, Utils::mask(regId));
     572             :         }
     573             :         else {
     574             :           arg.assignToStack(stackOffset);
     575           0 :           stackOffset += 8; // Always 8 bytes (float/double).
     576             :         }
     577           0 :         continue;
     578           0 :       }
     579             :     }
     580             :   }
     581             : 
     582       45233 :   func._argStackSize = stackOffset - stackBase;
     583       45233 :   return kErrorOk;
     584             : }
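                     : // Worked example (tracing the default algorithm above on SysV64): for a
                     : // signature like `double f(double, double)` both arguments are floats and
                     : // kFlagPassFloatsByVec is set, so they are assigned to xmm0 and xmm1
                     : // (vecPos 0 and 1), the kF64 return value is assigned to xmm0, nothing
                     : // spills to the stack, and `_argStackSize` ends up 0.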
     585             : 
     586             : // ============================================================================
     587             : // [asmjit::X86Internal - FrameLayout]
     588             : // ============================================================================
     589             : 
     590       32111 : ASMJIT_FAVOR_SIZE Error X86Internal::initFrameLayout(FuncFrameLayout& layout, const FuncDetail& func, const FuncFrameInfo& ffi) noexcept {
     591             :   layout.reset();
     592             : 
     593             :   uint32_t kind;
     594       32111 :   uint32_t gpSize = (func.getCallConv().getArchType() == ArchInfo::kTypeX86) ? 4 : 8;
     595             : 
     596             :   // Calculate a bit-mask of all registers that must be saved & restored.
     597      160555 :   for (kind = 0; kind < Globals::kMaxVRegKinds; kind++)
     598      128444 :     layout._savedRegs[kind] = (ffi.getDirtyRegs(kind) & ~func.getPassedRegs(kind)) & func.getPreservedRegs(kind);
     599             : 
     600             :   // Include EBP|RBP if the function preserves the frame-pointer.
     601       32111 :   if (ffi.hasPreservedFP()) {
     602           0 :     layout._preservedFP = true;
     603           0 :     layout._savedRegs[X86Reg::kKindGp] |= Utils::mask(X86Gp::kIdBp);
     604             :   }
     605             : 
     606             :   // Exclude ESP/RSP - this register is never included in saved-regs.
     607       32111 :   layout._savedRegs[X86Reg::kKindGp] &= ~Utils::mask(X86Gp::kIdSp);
     608             : 
     609             :   // Calculate the final stack alignment.
     610             :   uint32_t stackAlignment =
     611             :     std::max<uint32_t>(
     612             :       std::max<uint32_t>(
     613       32111 :         ffi.getStackFrameAlignment(),
     614       32111 :         ffi.getCallFrameAlignment()),
     615       32111 :       func.getCallConv().getNaturalStackAlignment());
     616       32111 :   layout._stackAlignment = static_cast<uint8_t>(stackAlignment);
     617             : 
      618             :   // Calculate whether dynamic stack alignment is required. If true, the
      619             :   // function has to align the stack dynamically to match `_stackAlignment`
      620             :   // and must access its stack-based arguments through `_stackArgsRegId`.
     621       32111 :   bool dsa = stackAlignment > func.getCallConv().getNaturalStackAlignment() && stackAlignment >= 16;
     622           0 :   layout._dynamicAlignment = dsa;
     623             : 
     624             :   // This flag describes if the prolog inserter must store the previous ESP|RSP
     625             :   // to stack so the epilog inserter can load the stack from it before returning.
     626           0 :   bool dsaSlotUsed = dsa && !ffi.hasPreservedFP();
     627       32111 :   layout._dsaSlotUsed = dsaSlotUsed;
     628             : 
     629             :   // These two are identical if the function doesn't align its stack dynamically.
     630             :   uint32_t stackArgsRegId = ffi.getStackArgsRegId();
     631       32111 :   if (stackArgsRegId == Globals::kInvalidRegId)
     632             :     stackArgsRegId = X86Gp::kIdSp;
     633             : 
     634             :   // Fix stack arguments base-register from ESP|RSP to EBP|RBP in case it was
     635             :   // not picked before and the function performs dynamic stack alignment.
     636       32111 :   if (dsa && stackArgsRegId == X86Gp::kIdSp)
     637             :     stackArgsRegId = X86Gp::kIdBp;
     638             : 
     639       32111 :   if (stackArgsRegId != X86Gp::kIdSp)
     640           0 :     layout._savedRegs[X86Reg::kKindGp] |= Utils::mask(stackArgsRegId) & func.getPreservedRegs(X86Gp::kKindGp);
     641             : 
     642       32111 :   layout._stackBaseRegId = X86Gp::kIdSp;
     643       32111 :   layout._stackArgsRegId = static_cast<uint8_t>(stackArgsRegId);
     644             : 
     645             :   // Setup stack size used to save preserved registers.
     646       32111 :   layout._gpStackSize  = Utils::bitCount(layout.getSavedRegs(X86Reg::kKindGp )) * gpSize;
     647       32111 :   layout._vecStackSize = Utils::bitCount(layout.getSavedRegs(X86Reg::kKindVec)) * 16 +
     648             :                          Utils::bitCount(layout.getSavedRegs(X86Reg::kKindMm )) *  8 ;
     649             : 
     650             :   uint32_t v = 0;                        // The beginning of the stack frame, aligned to CallFrame alignment.
     651       32111 :   v += ffi._callFrameSize;               // Count '_callFrameSize'  <- This is used to call functions.
     652             :   v  = Utils::alignTo(v, stackAlignment);// Align to function's SA
     653             : 
     654       32111 :   layout._stackBaseOffset = v;           // Store '_stackBaseOffset'<- Function's own stack starts here..
     655       32111 :   v += ffi._stackFrameSize;              // Count '_stackFrameSize' <- Function's own stack ends here.
     656             : 
      657             :   // If the stack is sufficiently aligned, align the offset used to store
      658             :   // vector registers and set `FuncFrameInfo::kX86FlagAlignedVecSR` to inform
      659             :   // the prolog/epilog inserter that it can use aligned stores/loads to
      660             :   // save/restore VEC registers.
     661       32111 :   if (stackAlignment >= 16 && layout._vecStackSize) {
     662             :     v = Utils::alignTo(v, 16);           // Align '_vecStackOffset'.
     663           0 :     layout._alignedVecSR = true;
     664             :   }
     665             : 
     666       32111 :   layout._vecStackOffset = v;            // Store '_vecStackOffset' <- Functions VEC Save|Restore starts here.
     667       32111 :   v += layout._vecStackSize;             // Count '_vecStackSize'   <- Functions VEC Save|Restore ends here.
     668             : 
     669       32111 :   if (dsaSlotUsed) {
     670           0 :     layout._dsaSlot = v;                 // Store '_dsaSlot'        <- Old stack pointer is stored here.
     671           0 :     v += gpSize;
     672             :   }
     673             : 
      674             :   // The return address should be stored after the GP save/restore regs. It
      675             :   // has the same size as `gpSize` (the native register/pointer size). We
      676             :   // don't add it to `v` yet, as `v` currently contains the exact size the
      677             :   // function requires to adjust (call frame + stack frame + vec stack size).
      678             :   // The stack (if we consider this size) is misaligned now, as it's always
      679             :   // aligned before the function call - when `call()` is executed it pushes
      680             :   // the current EIP|RIP onto the stack, leaving it 12 or 8 bytes short of
      681             :   // alignment (depending on the architecture). So count the number of bytes
      682             :   // needed to align it up to the function's CallFrame (the beginning).
     683       32111 :   if (v || ffi.hasCalls())
     684        7106 :     v += Utils::alignDiff(v + layout._gpStackSize + gpSize, stackAlignment);
     685             : 
     686       32111 :   layout._stackAdjustment = v;           // Store '_stackAdjustment'<- SA used by 'add zsp, SA' and 'sub zsp, SA'.
     687       32111 :   layout._gpStackOffset = v;             // Store '_gpStackOffset'  <- Functions GP Save|Restore starts here.
     688       32111 :   v += layout._gpStackSize;              // Count '_gpStackSize'    <- Functions GP Save|Restore ends here.
     689             : 
     690       32111 :   v += gpSize;                           // Count 'ReturnAddress'.
     691       32111 :   v += func.getSpillZoneSize();          // Count 'SpillZoneSize'.
     692             : 
     693             :   // Calculate where function arguments start, relative to the stackArgsRegId.
     694             :   // If the register that will be used to access arguments passed by stack is
     695             :   // ESP|RSP then it's exactly where we are now, otherwise we must calculate
     696             :   // how many 'push regs' we did and adjust it based on that.
     697             :   uint32_t stackArgsOffset = v;
     698       32111 :   if (stackArgsRegId != X86Gp::kIdSp) {
     699           0 :     if (ffi.hasPreservedFP())
     700             :       stackArgsOffset = gpSize;
     701             :     else
     702             :       stackArgsOffset = layout._gpStackSize;
     703             :   }
     704       32111 :   layout._stackArgsOffset = stackArgsOffset;
     705             : 
     706             :   // If the function does dynamic stack adjustment then the stack-adjustment
     707             :   // must be aligned.
     708       32111 :   if (dsa)
     709           0 :     layout._stackAdjustment = Utils::alignTo(layout._stackAdjustment, stackAlignment);
     710             : 
     711             :   // Initialize variables based on CallConv flags.
     712       32111 :   if (func.hasFlag(CallConv::kFlagCalleePopsStack))
     713           0 :     layout._calleeStackCleanup = static_cast<uint16_t>(func.getArgStackSize());
     714             : 
     715             :   // Initialize variables based on FFI flags.
     716       32111 :   layout._mmxCleanup = ffi.hasMmxCleanup();
     717       32111 :   layout._avxEnabled = ffi.isAvxEnabled();
     718       32111 :   layout._avxCleanup = ffi.hasAvxCleanup();
     719             : 
     720       32111 :   return kErrorOk;
     721             : }
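                     : // Worked example of the adjustment math above (illustrative numbers, x64):
                     : // stackAlignment = 16, gpSize = 8, _callFrameSize = 0, _stackFrameSize = 24,
                     : // _vecStackSize = 0, _gpStackSize = 8 (one saved GP register):
                     : //   v = 0, already aligned          -> _stackBaseOffset = 0
                     : //   v += 24 (own stack frame)       -> v = 24
                     : //   v += alignDiff(24 + 8 + 8, 16)  -> v = 32 (pad so the GP save area plus
                     : //                                      the return address realign to 16)
                     : //   _stackAdjustment = 32, _gpStackOffset = 32; counting the GP saves and
                     : //   the return address, stack-based arguments then start at offset 48.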
     722             : 
     723             : // ============================================================================
     724             : // [asmjit::X86Internal - ArgsToFrameInfo]
     725             : // ============================================================================
     726             : 
     727           0 : ASMJIT_FAVOR_SIZE Error X86Internal::argsToFrameInfo(const FuncArgsMapper& args, FuncFrameInfo& ffi) noexcept {
     728           0 :   X86FuncArgsContext ctx;
     729           0 :   ASMJIT_PROPAGATE(ctx.initWorkData(args, ffi._dirtyRegs, ffi.hasPreservedFP()));
     730             : 
     731           0 :   ASMJIT_PROPAGATE(ctx.markDstRegsDirty(ffi));
     732           0 :   ASMJIT_PROPAGATE(ctx.markRegsForSwaps(ffi));
     733           0 :   ASMJIT_PROPAGATE(ctx.markStackArgsReg(ffi));
     734             :   return kErrorOk;
     735             : }
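                     : // Note the ordering of the three passes above: destination registers are
                     : // marked dirty first, a swap temporary is then picked from the enlarged
                     : // work set, and only afterwards is the stack-args base register chosen
                     : // from whatever remains.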
     736             : 
     737             : // ============================================================================
     738             : // [asmjit::X86Internal - Emit Helpers]
     739             : // ============================================================================
     740             : 
     741      106936 : ASMJIT_FAVOR_SIZE Error X86Internal::emitRegMove(X86Emitter* emitter,
     742             :   const Operand_& dst_,
     743             :   const Operand_& src_, uint32_t typeId, bool avxEnabled, const char* comment) {
     744             : 
     745             :   // Invalid or abstract TypeIds are not allowed.
     746             :   ASMJIT_ASSERT(TypeId::isValid(typeId) && !TypeId::isAbstract(typeId));
     747             : 
     748             :   Operand dst(dst_);
     749             :   Operand src(src_);
     750             : 
     751             :   uint32_t instId = Inst::kIdNone;
     752             :   uint32_t memFlags = 0;
     753             : 
     754             :   enum MemFlags {
     755             :     kDstMem = 0x1,
     756             :     kSrcMem = 0x2
     757             :   };
     758             : 
      759             :   // Detect memory operands and patch them to have the same size as the register.
      760             :   // CodeCompiler always sets the memory size of allocs and spills, so this
      761             :   // shouldn't really be necessary; however, since this function was separated
      762             :   // from the Compiler it's better to make sure the size is always specified,
      763             :   // as we may emit 'movzx' and 'movsx', which rely on it.
     764      106936 :   if (dst.isMem()) { memFlags |= kDstMem; dst.as<X86Mem>().setSize(src.getSize()); }
     765      106936 :   if (src.isMem()) { memFlags |= kSrcMem; src.as<X86Mem>().setSize(dst.getSize()); }
     766             : 
     767      106936 :   switch (typeId) {
     768           0 :     case TypeId::kI8:
     769             :     case TypeId::kU8:
     770             :     case TypeId::kI16:
     771             :     case TypeId::kU16:
     772             :       // Special case - 'movzx' load.
     773           0 :       if (memFlags & kSrcMem) {
     774             :         instId = X86Inst::kIdMovzx;
     775             :         dst.setSignature(X86RegTraits<X86Reg::kRegGpd>::kSignature);
     776             :       }
     777           0 :       else if (!memFlags) {
     778             :         // Change both destination and source registers to GPD (safer, no dependencies).
     779             :         dst.setSignature(X86RegTraits<X86Reg::kRegGpd>::kSignature);
     780             :         src.setSignature(X86RegTraits<X86Reg::kRegGpd>::kSignature);
     781             :       }
     782             :       ASMJIT_FALLTHROUGH;
     783             : 
     784             :     case TypeId::kI32:
     785             :     case TypeId::kU32:
     786             :     case TypeId::kI64:
     787             :     case TypeId::kU64:
     788             :       instId = X86Inst::kIdMov;
     789             :       break;
     790             : 
     791           0 :     case TypeId::kMmx32:
     792             :       instId = X86Inst::kIdMovd;
     793           0 :       if (memFlags) break;
     794             :       ASMJIT_FALLTHROUGH;
     795           0 :     case TypeId::kMmx64 : instId = X86Inst::kIdMovq ; break;
     796             :     case TypeId::kMask8 : instId = X86Inst::kIdKmovb; break;
     797           0 :     case TypeId::kMask16: instId = X86Inst::kIdKmovw; break;
     798           0 :     case TypeId::kMask32: instId = X86Inst::kIdKmovd; break;
     799           0 :     case TypeId::kMask64: instId = X86Inst::kIdKmovq; break;
     800             : 
     801             :     default: {
     802             :       uint32_t elementTypeId = TypeId::elementOf(typeId);
     803      106936 :       if (TypeId::isVec32(typeId) && memFlags) {
     804           0 :         if (elementTypeId == TypeId::kF32)
     805           0 :           instId = avxEnabled ? X86Inst::kIdVmovss : X86Inst::kIdMovss;
     806             :         else
     807           0 :           instId = avxEnabled ? X86Inst::kIdVmovd : X86Inst::kIdMovd;
     808             :         break;
     809             :       }
     810             : 
     811      106936 :       if (TypeId::isVec64(typeId) && memFlags) {
     812       92292 :         if (elementTypeId == TypeId::kF64)
     813       92292 :           instId = avxEnabled ? X86Inst::kIdVmovsd : X86Inst::kIdMovsd;
     814             :         else
     815           0 :           instId = avxEnabled ? X86Inst::kIdVmovq : X86Inst::kIdMovq;
     816             :         break;
     817             :       }
     818             : 
     819       14644 :       if (elementTypeId == TypeId::kF32)
     820           0 :         instId = avxEnabled ? X86Inst::kIdVmovaps : X86Inst::kIdMovaps;
     821       14644 :       else if (elementTypeId == TypeId::kF64)
     822       14644 :         instId = avxEnabled ? X86Inst::kIdVmovapd : X86Inst::kIdMovapd;
     823           0 :       else if (typeId <= TypeId::_kVec256End)
     824           0 :         instId = avxEnabled ? X86Inst::kIdVmovdqa : X86Inst::kIdMovdqa;
     825           0 :       else if (elementTypeId <= TypeId::kU32)
     826             :         instId = X86Inst::kIdVmovdqa32;
     827             :       else
     828             :         instId = X86Inst::kIdVmovdqa64;
     829             :       break;
     830             :     }
     831             :   }
     832             : 
     833             :   if (!instId)
     834             :     return DebugUtils::errored(kErrorInvalidState);
     835             : 
     836             :   emitter->setInlineComment(comment);
     837      106936 :   return emitter->emit(instId, dst, src);
     838             : }
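                     : // For example (following the default case above): spilling a vec64 f64 to
                     : // memory selects movsd (vmovsd when AVX is enabled), while a plain
                     : // register-to-register f64x2 move selects movapd (vmovapd under AVX).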
     839             : 
     840           0 : ASMJIT_FAVOR_SIZE Error X86Internal::emitArgMove(X86Emitter* emitter,
     841             :   const X86Reg& dst_, uint32_t dstTypeId,
     842             :   const Operand_& src_, uint32_t srcTypeId, bool avxEnabled, const char* comment) {
     843             : 
     844             :   // Deduce optional `dstTypeId`, which may be `TypeId::kVoid` in some cases.
     845           0 :   if (!dstTypeId) dstTypeId = x86OpData.archRegs.regTypeToTypeId[dst_.getType()];
     846             : 
     847             :   // Invalid or abstract TypeIds are not allowed.
     848             :   ASMJIT_ASSERT(TypeId::isValid(dstTypeId) && !TypeId::isAbstract(dstTypeId));
     849             :   ASMJIT_ASSERT(TypeId::isValid(srcTypeId) && !TypeId::isAbstract(srcTypeId));
     850             : 
     851             :   X86Reg dst(dst_);
     852             :   Operand src(src_);
     853             : 
     854           0 :   uint32_t dstSize = TypeId::sizeOf(dstTypeId);
     855           0 :   uint32_t srcSize = TypeId::sizeOf(srcTypeId);
     856             : 
     857             :   int32_t instId = Inst::kIdNone;
     858             : 
      859             :   // Not a real loop; 'break' is just nicer than 'goto'.
     860             :   for (;;) {
     861           0 :     if (TypeId::isInt(dstTypeId)) {
     862           0 :       if (TypeId::isInt(srcTypeId)) {
     863             :         instId = X86Inst::kIdMovsx;
     864           0 :         uint32_t typeOp = (dstTypeId << 8) | srcTypeId;
     865             : 
     866             :         // Sign extend by using 'movsx'.
     867           0 :         if (typeOp == ((TypeId::kI16 << 8) | TypeId::kI8 ) ||
     868           0 :             typeOp == ((TypeId::kI32 << 8) | TypeId::kI8 ) ||
     869           0 :             typeOp == ((TypeId::kI32 << 8) | TypeId::kI16) ||
     870           0 :             typeOp == ((TypeId::kI64 << 8) | TypeId::kI8 ) ||
     871             :             typeOp == ((TypeId::kI64 << 8) | TypeId::kI16)) break;
     872             : 
     873             :         // Sign extend by using 'movsxd'.
     874             :         instId = X86Inst::kIdMovsxd;
     875             :         if (typeOp == ((TypeId::kI64 << 8) | TypeId::kI32)) break;
     876             :       }
     877             : 
     878           0 :       if (TypeId::isInt(srcTypeId) || src_.isMem()) {
     879             :         // Zero extend by using 'movzx' or 'mov'.
     880           0 :         if (dstSize <= 4 && srcSize < 4) {
     881             :           instId = X86Inst::kIdMovzx;
     882             :           dst.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
     883             :         }
     884             :         else {
      885             :           // We should have caught all possibilities where `srcSize` is less
      886             :           // than 4, so we don't have to worry about 'movzx' anymore. The
      887             :           // minimum size is enough to pick between a 32-bit and 64-bit move.
     888             :           instId = X86Inst::kIdMov;
     889           0 :           srcSize = std::min(srcSize, dstSize);
     890             : 
     891           0 :           dst.setSignature(srcSize == 4 ? X86Reg::signatureOfT<X86Reg::kRegGpd>()
     892             :                                         : X86Reg::signatureOfT<X86Reg::kRegGpq>());
     893           0 :           if (src.isReg()) src.setSignature(dst.getSignature());
     894             :         }
     895             :         break;
     896             :       }
     897             : 
      898             :       // NOTE: The previous branch caught all memory sources; from here on it's
      899             :       // always a register-to-register conversion, so catch the remaining cases.
     900           0 :       srcSize = std::min(srcSize, dstSize);
     901             : 
     902           0 :       if (TypeId::isMmx(srcTypeId)) {
     903             :         // 64-bit move.
     904             :         instId = X86Inst::kIdMovq;
     905           0 :         if (srcSize == 8) break;
     906             : 
     907             :         // 32-bit move.
     908             :         instId = X86Inst::kIdMovd;
     909             :         dst.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
     910             :         break;
     911             :       }
     912             : 
     913           0 :       if (TypeId::isMask(srcTypeId)) {
     914           0 :         instId = X86Inst::kmovIdFromSize(srcSize);
     915           0 :         dst.setSignature(srcSize <= 4 ? X86Reg::signatureOfT<X86Reg::kRegGpd>()
     916             :                                       : X86Reg::signatureOfT<X86Reg::kRegGpq>());
     917             :         break;
     918             :       }
     919             : 
     920           0 :       if (TypeId::isVec(srcTypeId)) {
     921             :         // 64-bit move.
     922           0 :         instId = avxEnabled ? X86Inst::kIdVmovq : X86Inst::kIdMovq;
     923           0 :         if (srcSize == 8) break;
     924             : 
     925             :         // 32-bit move.
     926           0 :         instId = avxEnabled ? X86Inst::kIdVmovd : X86Inst::kIdMovd;
     927             :         dst.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
     928             :         break;
     929             :       }
     930             :     }
     931             : 
     932           0 :     if (TypeId::isMmx(dstTypeId)) {
     933             :       instId = X86Inst::kIdMovq;
     934           0 :       srcSize = std::min(srcSize, dstSize);
     935             : 
     936           0 :       if (TypeId::isInt(srcTypeId) || src.isMem()) {
     937             :         // 64-bit move.
     938           0 :         if (srcSize == 8) break;
     939             : 
     940             :         // 32-bit move.
     941             :         instId = X86Inst::kIdMovd;
     942           0 :         if (src.isReg()) src.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
     943             :         break;
     944             :       }
     945             : 
     946           0 :       if (TypeId::isMmx(srcTypeId)) break;
     947             : 
     948             :       // NOTE: 'movdq2q' has no AVX form, so emitting it with `avxEnabled` may incur an SSE/AVX transition penalty.
     949             :       instId = X86Inst::kIdMovdq2q;
     950           0 :       if (TypeId::isVec(srcTypeId)) break;
     951             :     }
     952             : 
     953           0 :     if (TypeId::isMask(dstTypeId)) {
     954           0 :       srcSize = std::min(srcSize, dstSize);
     955             : 
     956           0 :       if (TypeId::isInt(srcTypeId) || TypeId::isMask(srcTypeId) || src.isMem()) {
     957           0 :         instId = X86Inst::kmovIdFromSize(srcSize);
     958           0 :         if (X86Reg::isGp(src) && srcSize <= 4) src.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
     959             :         break;
     960             :       }
     961             :     }
     962             : 
     963           0 :     if (TypeId::isVec(dstTypeId)) {
     964             :       // By default set the destination to XMM; it is widened to YMM|ZMM if needed.
     965             :       dst.setSignature(X86Reg::signatureOfT<X86Reg::kRegXmm>());
     966             : 
     967             :       // NOTE: 'movq2dq' has no AVX form, so emitting it with `avxEnabled` may incur an SSE/AVX transition penalty.
     968           0 :       if (X86Reg::isMm(src)) {
     969             :         // 64-bit move.
     970             :         instId = X86Inst::kIdMovq2dq;
     971             :         break;
     972             :       }
     973             : 
     974             :       // Argument conversion.
     975             :       uint32_t dstElement = TypeId::elementOf(dstTypeId);
     976             :       uint32_t srcElement = TypeId::elementOf(srcTypeId);
     977             : 
     978           0 :       if (dstElement == TypeId::kF32 && srcElement == TypeId::kF64) {
     979           0 :         srcSize = std::min(dstSize * 2, srcSize);
     980           0 :         dstSize = srcSize / 2;
     981             : 
     982           0 :         if (srcSize <= 8)
     983           0 :           instId = avxEnabled ? X86Inst::kIdVcvtsd2ss : X86Inst::kIdCvtsd2ss;
     984             :         else
     985           0 :           instId = avxEnabled ? X86Inst::kIdVcvtpd2ps : X86Inst::kIdCvtpd2ps;
     986             : 
     987           0 :         if (dstSize == 32)
     988             :           dst.setSignature(X86Reg::signatureOfT<X86Reg::kRegYmm>());
     989           0 :         if (src.isReg())
     990             :           src.setSignature(X86Reg::signatureOfVecBySize(srcSize));
     991             :         break;
     992             :       }
     993             : 
     994           0 :       if (dstElement == TypeId::kF64 && srcElement == TypeId::kF32) {
     995           0 :         srcSize = std::min(dstSize, srcSize * 2) / 2;
     996           0 :         dstSize = srcSize * 2;
     997             : 
     998           0 :         if (srcSize <= 4)
     999           0 :           instId = avxEnabled ? X86Inst::kIdVcvtss2sd : X86Inst::kIdCvtss2sd;
    1000             :         else
    1001           0 :           instId = avxEnabled ? X86Inst::kIdVcvtps2pd : X86Inst::kIdCvtps2pd;
    1002             : 
    1003             :         dst.setSignature(X86Reg::signatureOfVecBySize(dstSize));
    1004           0 :         if (src.isReg() && srcSize >= 32)
    1005             :           src.setSignature(X86Reg::signatureOfT<X86Reg::kRegYmm>());
    1006             :         break;
    1007             :       }
    1008             : 
    1009           0 :       srcSize = std::min(srcSize, dstSize);
    1010           0 :       if (X86Reg::isGp(src) || src.isMem()) {
    1011             :         // 32-bit move.
    1012           0 :         if (srcSize <= 4) {
    1013           0 :           instId = avxEnabled ? X86Inst::kIdVmovd : X86Inst::kIdMovd;
    1014           0 :           if (src.isReg()) src.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
    1015             :           break;
    1016             :         }
    1017             : 
    1018             :         // 64-bit move.
    1019           0 :         if (srcSize == 8) {
    1020           0 :           instId = avxEnabled ? X86Inst::kIdVmovq : X86Inst::kIdMovq;
    1021             :           break;
    1022             :         }
    1023             :       }
    1024             : 
    1025           0 :       if (X86Reg::isVec(src) || src.isMem()) {
    1026           0 :         instId = avxEnabled ? X86Inst::kIdVmovaps : X86Inst::kIdMovaps;
    1027             :         uint32_t sign = X86Reg::signatureOfVecBySize(srcSize);
    1028             : 
    1029             :         dst.setSignature(sign);
    1030           0 :         if (src.isReg()) src.setSignature(sign);
    1031             :         break;
    1032             :       }
    1033             :     }
    1034             : 
    1035             :     return DebugUtils::errored(kErrorInvalidState);
    1036             :   }
    1037             : 
    1038           0 :   if (src.isMem())
    1039           0 :     src.as<X86Mem>().setSize(srcSize);
    1040             : 
    1041             :   emitter->setInlineComment(comment);
    1042           0 :   return emitter->emit(instId, dst, src);
    1043             : }
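                     : // Quick reference for the vector moves/conversions selected above; each
                     : // pair lists the SSE / AVX form (a compact summary of the code, added
                     : // for orientation):
                     : //
                     : //   f32 dst <- f64 src : cvtsd2ss / vcvtsd2ss (scalar), cvtpd2ps / vcvtpd2ps (packed)
                     : //   f64 dst <- f32 src : cvtss2sd / vcvtss2sd (scalar), cvtps2pd / vcvtps2pd (packed)
                     : //   vec dst <- gp/mem  : movd / vmovd (32-bit), movq / vmovq (64-bit)
                     : //   vec dst <- vec     : movaps / vmovaps (width chosen by `srcSize`)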
    1044             : 
    1045             : // ============================================================================
    1046             : // [asmjit::X86Internal - Emit Prolog & Epilog]
    1047             : // ============================================================================
    1048             : 
    1049       32111 : ASMJIT_FAVOR_SIZE Error X86Internal::emitProlog(X86Emitter* emitter, const FuncFrameLayout& layout) {
    1050             :   uint32_t gpSaved = layout.getSavedRegs(X86Reg::kKindGp);
    1051             : 
    1052             :   X86Gp zsp = emitter->zsp();   // ESP|RSP register.
    1053             :   X86Gp zbp = emitter->zbp();   // EBP|RBP register.
    1054             :   X86Gp gpReg = emitter->zsp(); // General purpose register (temporary).
    1055             :   X86Gp saReg = emitter->zsp(); // Stack-arguments base register.
    1056             : 
    1057             :   // Emit: 'push zbp'
    1058             :   //       'mov  zbp, zsp'.
    1059       32111 :   if (layout.hasPreservedFP()) {
    1060           0 :     gpSaved &= ~Utils::mask(X86Gp::kIdBp);
    1061           0 :     ASMJIT_PROPAGATE(emitter->push(zbp));
    1062           0 :     ASMJIT_PROPAGATE(emitter->mov(zbp, zsp));
    1063             :   }
    1064             : 
    1065             :   // Emit: 'push gp' sequence.
    1066       32111 :   if (gpSaved) {
    1067       35546 :     for (uint32_t i = gpSaved, regId = 0; i; i >>= 1, regId++) {
    1068       28468 :       if (!(i & 0x1)) continue;
    1069             :       gpReg.setId(regId);
    1070        7156 :       ASMJIT_PROPAGATE(emitter->push(gpReg));
    1071             :     }
    1072             :   }
    1073             : 
    1074             :   // Emit: 'mov saReg, zsp'.
    1075             :   uint32_t stackArgsRegId = layout.getStackArgsRegId();
    1076       32111 :   if (stackArgsRegId != Globals::kInvalidRegId && stackArgsRegId != X86Gp::kIdSp) {
    1077             :     saReg.setId(stackArgsRegId);
    1078           0 :     if (!(layout.hasPreservedFP() && stackArgsRegId == X86Gp::kIdBp))
    1079           0 :       ASMJIT_PROPAGATE(emitter->mov(saReg, zsp));
    1080             :   }
    1081             : 
    1082             :   // Emit: 'and zsp, -StackAlignment' (dynamically align the stack).
    1083       32111 :   if (layout.hasDynamicAlignment())
    1084           0 :     ASMJIT_PROPAGATE(emitter->and_(zsp, -static_cast<int32_t>(layout.getStackAlignment())));
    1085             : 
    1086             :   // Emit: 'sub zsp, StackAdjustment'.
    1087       32111 :   if (layout.hasStackAdjustment())
    1088        4712 :     ASMJIT_PROPAGATE(emitter->sub(zsp, layout.getStackAdjustment()));
    1089             : 
    1090             :   // Emit: 'mov [zsp + dsaSlot], saReg'.
    1091       32111 :   if (layout.hasDynamicAlignment() && layout.hasDsaSlotUsed()) {
    1092           0 :     X86Mem saMem = x86::ptr(zsp, layout._dsaSlot);
    1093           0 :     ASMJIT_PROPAGATE(emitter->mov(saMem, saReg));
    1094             :   }
    1095             : 
    1096             :   // Emit 'movaps|movups [zsp + X], xmm0..15'.
    1097             :   uint32_t xmmSaved = layout.getSavedRegs(X86Reg::kKindVec);
    1098       32111 :   if (xmmSaved) {
    1099             :     X86Mem vecBase = x86::ptr(zsp, layout.getVecStackOffset());
    1100             :     X86Reg vecReg = x86::xmm(0);
    1101             : 
    1102             :     uint32_t vecInst = x86GetXmmMovInst(layout);
    1103             :     uint32_t vecSize = 16;
    1104             : 
    1105           0 :     for (uint32_t i = xmmSaved, regId = 0; i; i >>= 1, regId++) {
    1106           0 :       if (!(i & 0x1)) continue;
    1107             :       vecReg.setId(regId);
    1108           0 :       ASMJIT_PROPAGATE(emitter->emit(vecInst, vecBase, vecReg));
    1109             :       vecBase.addOffsetLo32(static_cast<int32_t>(vecSize));
    1110             :     }
    1111             :   }
    1112             : 
    1113             :   return kErrorOk;
    1114             : }
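                     : // For orientation, a hedged sketch (not captured emitter output): a frame
                     : // that saves rbx and needs 32 bytes of local stack, without a preserved
                     : // frame pointer, would make emitProlog() produce roughly:
                     : //
                     : //   push rbx        ; 'push gp' sequence from gpSaved
                     : //   sub  rsp, 32    ; layout.hasStackAdjustment()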
    1115             : 
    1116       32111 : ASMJIT_FAVOR_SIZE Error X86Internal::emitEpilog(X86Emitter* emitter, const FuncFrameLayout& layout) {
    1117             :   uint32_t i;
    1118             :   uint32_t regId;
    1119             : 
    1120             :   uint32_t gpSize = emitter->getGpSize();
    1121             :   uint32_t gpSaved = layout.getSavedRegs(X86Reg::kKindGp);
    1122             : 
    1123             :   X86Gp zsp = emitter->zsp();   // ESP|RSP register.
    1124             :   X86Gp zbp = emitter->zbp();   // EBP|RBP register.
    1125             :   X86Gp gpReg = emitter->zsp(); // General purpose register (temporary).
    1126             : 
    1127             :   // Don't emit 'pop zbp' in the pop sequence; this case is handled separately.
    1128       32111 :   if (layout.hasPreservedFP()) gpSaved &= ~Utils::mask(X86Gp::kIdBp);
    1129             : 
    1130             :   // Emit 'movaps|movups xmm0..15, [zsp + X]'.
    1131             :   uint32_t xmmSaved = layout.getSavedRegs(X86Reg::kKindVec);
    1132       32111 :   if (xmmSaved) {
    1133             :     X86Mem vecBase = x86::ptr(zsp, layout.getVecStackOffset());
    1134             :     X86Reg vecReg = x86::xmm(0);
    1135             : 
    1136             :     uint32_t vecInst = x86GetXmmMovInst(layout);
    1137             :     uint32_t vecSize = 16;
    1138             : 
    1139           0 :     for (i = xmmSaved, regId = 0; i; i >>= 1, regId++) {
    1140           0 :       if (!(i & 0x1)) continue;
    1141             :       vecReg.setId(regId);
    1142           0 :       ASMJIT_PROPAGATE(emitter->emit(vecInst, vecReg, vecBase));
    1143             :       vecBase.addOffsetLo32(static_cast<int32_t>(vecSize));
    1144             :     }
    1145             :   }
    1146             : 
    1147             :   // Emit 'emms' and 'vzeroupper'.
    1148       32111 :   if (layout.hasMmxCleanup()) ASMJIT_PROPAGATE(emitter->emms());
    1149       32111 :   if (layout.hasAvxCleanup()) ASMJIT_PROPAGATE(emitter->vzeroupper());
    1150             : 
    1151       32111 :   if (layout.hasPreservedFP()) {
    1152             :     // Emit 'mov zsp, zbp' or 'lea zsp, [zbp - x]'
    1153           0 :     int32_t count = static_cast<int32_t>(layout.getGpStackSize() - gpSize);
    1154           0 :     if (!count)
    1155           0 :       ASMJIT_PROPAGATE(emitter->mov(zsp, zbp));
    1156             :     else
    1157           0 :       ASMJIT_PROPAGATE(emitter->lea(zsp, x86::ptr(zbp, -count)));
    1158             :   }
    1159             :   else {
    1160       32111 :     if (layout.hasDynamicAlignment() && layout.hasDsaSlotUsed()) {
    1161             :       // Emit 'mov zsp, [zsp + DsaSlot]'.
    1162           0 :       X86Mem saMem = x86::ptr(zsp, layout._dsaSlot);
    1163           0 :       ASMJIT_PROPAGATE(emitter->mov(zsp, saMem));
    1164             :     }
    1165       32111 :     else if (layout.hasStackAdjustment()) {
    1166             :       // Emit 'add zsp, StackAdjustment'.
    1167        4712 :       ASMJIT_PROPAGATE(emitter->add(zsp, static_cast<int32_t>(layout.getStackAdjustment())));
    1168             :     }
    1169             :   }
    1170             : 
    1171             :   // Emit 'pop gp' sequence.
    1172       32111 :   if (gpSaved) {
    1173             :     i = gpSaved;
    1174             :     regId = 16;
    1175             : 
    1176             :     do {
    1177      113248 :       regId--;
    1178      113248 :       if (i & 0x8000) {
    1179             :         gpReg.setId(regId);
    1180        7156 :         ASMJIT_PROPAGATE(emitter->pop(gpReg));
    1181             :       }
    1182      113248 :       i <<= 1;
    1183      113248 :     } while (regId != 0);
    1184             :   }
    1185             : 
    1186             :   // Emit 'pop zbp'.
    1187       32111 :   if (layout.hasPreservedFP()) ASMJIT_PROPAGATE(emitter->pop(zbp));
    1188             : 
    1189             :   // Emit 'ret' or 'ret x'.
    1190       32111 :   if (layout.hasCalleeStackCleanup())
    1191           0 :     ASMJIT_PROPAGATE(emitter->emit(X86Inst::kIdRet, static_cast<int>(layout.getCalleeStackCleanup())));
    1192             :   else
    1193       32111 :     ASMJIT_PROPAGATE(emitter->emit(X86Inst::kIdRet));
    1194             : 
    1195             :   return kErrorOk;
    1196             : }
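                     : // The matching epilog for the frame sketched after emitProlog() above
                     : // would be roughly (again a sketch; the real sequence is driven entirely
                     : // by FuncFrameLayout):
                     : //
                     : //   add rsp, 32     ; undo the stack adjustment
                     : //   pop rbx         ; 'pop gp' sequence, reverse order of the pushes
                     : //   ret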
    1197             : 
    1198             : // ============================================================================
    1199             : // [asmjit::X86Internal - AllocArgs]
    1200             : // ============================================================================
    1201             : 
    1202           0 : ASMJIT_FAVOR_SIZE Error X86Internal::allocArgs(X86Emitter* emitter, const FuncFrameLayout& layout, const FuncArgsMapper& args) {
    1203             :   typedef X86FuncArgsContext::SrcArg SrcArg;
    1204             :   typedef X86FuncArgsContext::DstArg DstArg;
    1205             :   typedef X86FuncArgsContext::WorkData WorkData;
    1206             :   enum { kMaxVRegKinds = Globals::kMaxVRegKinds };
    1207             : 
    1208             :   uint32_t i;
    1209             :   const FuncDetail& func = *args.getFuncDetail();
    1210             : 
    1211           0 :   X86FuncArgsContext ctx;
    1212           0 :   ASMJIT_PROPAGATE(ctx.initWorkData(args, layout._savedRegs, layout.hasPreservedFP()));
    1213             : 
    1214             :   // We must honor AVX if it's enabled.
    1215             :   bool avxEnabled = layout.isAvxEnabled();
    1216             : 
    1217             :   // Free registers that can be used as temporaries and during shuffling.
    1218             :   // We initialize them to match all workRegs (registers that can be used
    1219             :   // by the function) except source regs, which are used to pass arguments.
    1220             :   // Free registers change during shuffling - when an argument is moved
    1221             :   // into its final register, that register is removed from freeRegs
    1222             :   // (it must not be altered anymore during shuffling).
    1223             :   uint32_t freeRegs[kMaxVRegKinds];
    1224           0 :   for (i = 0; i < kMaxVRegKinds; i++)
    1225           0 :     freeRegs[i] = ctx._workData[i].workRegs & ~ctx._workData[i].srcRegs;
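                     :   // E.g. (hypothetical masks): with workRegs = 0b11111111 and arguments
                     :   // arriving in rcx/rdx (srcRegs = 0b00001100), freeRegs = 0b11110011,
                     :   // so only those registers may be clobbered while shuffling.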
    1226             : 
    1227             :   // This is an iterative process that runs as long as there is work to do.
    1228             :   // When one register is moved it can free space for another move. Such moves
    1229             :   // can depend on each other, so the algorithm may need multiple passes before
    1230             :   // all arguments are in place. This part does only register-to-register work;
    1231             :   // arguments moved from stack to register are handled later.
    1232             :   for (;;) {
    1233             :     bool hasWork = false; // Is there still work to do?
    1234             :     bool didWork = false; // Did this pass make any progress?
    1235             : 
    1236             :     uint32_t dstRegKind = kMaxVRegKinds;
    1237             :     do {
    1238           0 :       WorkData& wd = ctx._workData[--dstRegKind];
    1239           0 :       if (wd.numOps > wd.numStackArgs) {
    1240             :         hasWork = true;
    1241             : 
    1242             :         // Iterate over all destination regs and check if we can do something.
    1243             :         // We always go from destination to source, never the opposite.
    1244           0 :         uint32_t regsToDo = wd.dstRegs;
    1245             :         do {
    1246             :           // If there is work to do, there has to be at least one dstReg.
    1247             :           ASMJIT_ASSERT(regsToDo != 0);
    1248             :           uint32_t dstRegId = Utils::findFirstBit(regsToDo);
    1249             :           uint32_t dstRegMask = Utils::mask(dstRegId);
    1250             : 
    1251           0 :           uint32_t argIndex = wd.argIndex[dstRegId];
    1252           0 :           const DstArg& dstArg = args.getArg(argIndex);
    1253             :           const SrcArg& srcArg = func.getArg(argIndex);
    1254             : 
    1255           0 :           if (srcArg.byReg()) {
    1256           0 :             uint32_t srcRegType = srcArg.getRegType();
    1257             :             uint32_t srcRegKind = X86Reg::kindOf(srcRegType);
    1258             : 
    1259           0 :             if (freeRegs[dstRegKind] & dstRegMask) {
    1260             :               X86Reg dstReg(X86Reg::fromTypeAndId(dstArg.getRegType(), dstRegId));
    1261             :               X86Reg srcReg(X86Reg::fromTypeAndId(srcRegType, srcArg.getRegId()));
    1262             : 
    1263           0 :               ASMJIT_PROPAGATE(
    1264             :                 emitArgMove(emitter,
    1265             :                   dstReg, dstArg.getTypeId(),
    1266             :                   srcReg, srcArg.getTypeId(), avxEnabled));
    1267           0 :               freeRegs[dstRegKind] ^= dstRegMask;                     // Make the DST reg occupied.
    1268           0 :               freeRegs[srcRegKind] |= Utils::mask(srcArg.getRegId()); // Make the SRC reg free.
    1269             : 
    1270             :               ASMJIT_ASSERT(wd.numOps >= 1);
    1271           0 :               wd.numOps--;
    1272             :               didWork = true;
    1273             :             }
    1274             :             else {
    1275             :               // Check if this is a swap operation.
    1276           0 :               if (dstRegKind == srcRegKind) {
    1277             :                 uint32_t srcRegId = srcArg.getRegId();
    1278             : 
    1279           0 :                 uint32_t otherIndex = wd.argIndex[srcRegId];
    1280           0 :                 const DstArg& otherArg = args.getArg(otherIndex);
    1281             : 
    1282           0 :                 if (otherArg.getRegId() == srcRegId && X86Reg::kindOf(otherArg.getRegType()) == dstRegKind) {
    1283             :                   // If this is GP reg it can be handled by 'xchg'.
    1284           0 :                   if (dstRegKind == X86Reg::kKindGp) {
    1285           0 :                     uint32_t highestType = std::max(dstArg.getRegType(), srcRegType);
    1286             : 
    1287           0 :                     X86Reg dstReg = x86::gpd(dstRegId);
    1288           0 :                     X86Reg srcReg = x86::gpd(srcRegId);
    1289             : 
    1290           0 :                     if (highestType == X86Reg::kRegGpq) {
    1291             :                       dstReg.setSignature(X86RegTraits<X86Reg::kRegGpq>::kSignature);
    1292             :                       srcReg.setSignature(X86RegTraits<X86Reg::kRegGpq>::kSignature);
    1293             :                     }
    1294           0 :                     ASMJIT_PROPAGATE(emitter->emit(X86Inst::kIdXchg, dstReg, srcReg));
    1295           0 :                     regsToDo &= ~Utils::mask(srcRegId);
    1296           0 :                     freeRegs[dstRegKind] &= ~(Utils::mask(srcRegId) | dstRegMask);
    1297             : 
    1298             :                     ASMJIT_ASSERT(wd.numOps >= 2);
    1299             :                     ASMJIT_ASSERT(wd.numSwaps >= 1);
    1300           0 :                     wd.numOps -= 2;
    1301           0 :                     wd.numSwaps--;
    1302             :                     didWork = true;
    1303             :                   }
    1304             :                 }
    1305             :               }
    1306             :             }
    1307             :           }
    1308             : 
    1309             :           // Clear the reg in `regsToDo` and continue if there are more.
    1310           0 :           regsToDo ^= dstRegMask;
    1311           0 :         } while (regsToDo);
    1312             :       }
    1313           0 :     } while (dstRegKind);
    1314             : 
    1315           0 :     if (!hasWork)
    1316             :       break;
    1317             : 
    1318           0 :     if (!didWork)
    1319             :       return DebugUtils::errored(kErrorInvalidState);
    1320             :   }
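                     :   // Swap example (hypothetical): if arg0 arrives in rdx but belongs in
                     :   // rcx while arg1 arrives in rcx and belongs in rdx, neither destination
                     :   // is free; the cycle is resolved by a single 'xchg rcx, rdx' in the
                     :   // GP-only swap path above.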
    1321             : 
    1322             :   // Load arguments passed by stack into registers. This is straightforward
    1323             :   // and, unlike the previous phase, never requires multiple iterations.
    1324           0 :   if (ctx._hasStackArgs) {
    1325             :     // Base address of all arguments passed by stack.
    1326             :     X86Mem saBase = x86::ptr(emitter->gpz(layout.getStackArgsRegId()), layout.getStackArgsOffset());
    1327             : 
    1328             :     uint32_t dstRegKind = kMaxVRegKinds;
    1329             :     do {
    1330           0 :       WorkData& wd = ctx._workData[--dstRegKind];
    1331           0 :       if (wd.numStackArgs) {
    1332             :         // Iterate over all destination regs and check if we can do something.
    1333             :         // We always go from destination to source, never the opposite.
    1334           0 :         uint32_t regsToDo = wd.dstRegs;
    1335             :         do {
    1336             :           // If there is a work to do there has to be at least one dstReg.
    1337             :           // If there is work to do, there has to be at least one dstReg.
    1338             :           ASMJIT_ASSERT(wd.numOps > 0);
    1339             : 
    1340             :           uint32_t dstRegId = Utils::findFirstBit(regsToDo);
    1341             :           uint32_t dstRegMask = Utils::mask(dstRegId);
    1342             : 
    1343           0 :           uint32_t argIndex = wd.argIndex[dstRegId];
    1344           0 :           const DstArg& dstArg = args.getArg(argIndex);
    1345             :           const SrcArg& srcArg = func.getArg(argIndex);
    1346             : 
    1347             :           // Only arguments passed by stack should remain, and the destination
    1348             :           // registers must be free now (otherwise the first part of the algorithm
    1349             :           // failed). Ideally this would be an assert, but it's much safer to
    1350             :           // enforce this in release builds as well.
    1351           0 :           if (!srcArg.byStack() || !(freeRegs[dstRegKind] & dstRegMask))
    1352           0 :             return DebugUtils::errored(kErrorInvalidState);
    1353             : 
    1354             :           X86Reg dstReg = X86Reg::fromTypeAndId(dstArg.getRegType(), dstRegId);
    1355             :           X86Mem srcMem = saBase.adjusted(srcArg.getStackOffset());
    1356             : 
    1357           0 :           ASMJIT_PROPAGATE(
    1358             :             emitArgMove(emitter,
    1359             :               dstReg, dstArg.getTypeId(),
    1360             :               srcMem, srcArg.getTypeId(), avxEnabled));
    1361             : 
    1362           0 :           freeRegs[dstRegKind] ^= dstRegMask;
    1363           0 :           regsToDo ^= dstRegMask;
    1364           0 :           wd.numOps--;
    1365           0 :         } while (regsToDo);
    1366             :       }
    1367           0 :     } while (dstRegKind);
    1368             :   }
    1369             : 
    1370             :   return kErrorOk;
    1371             : }
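                     : // How these helpers are typically combined by a higher layer. A hedged
                     : // sketch: `func` (FuncDetail), `ffi` (FuncFrameInfo), `args` and `emitter`
                     : // are assumed to be initialized elsewhere, and `layout.init(func, ffi)`
                     : // is assumed to be the frame-layout initializer of this asmjit version:
                     : //
                     : //   FuncFrameLayout layout;
                     : //   ASMJIT_PROPAGATE(layout.init(func, ffi));
                     : //   ASMJIT_PROPAGATE(X86Internal::emitProlog(emitter, layout));
                     : //   ASMJIT_PROPAGATE(X86Internal::allocArgs(emitter, layout, args));
                     : //   // ... function body ...
                     : //   ASMJIT_PROPAGATE(X86Internal::emitEpilog(emitter, layout));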
    1372             : 
    1373             : } // namespace asmjit
    1374             : } // namespace PLMD
    1375             : 
    1376             : // [Api-End]
    1377             : #include "./asmjit_apiend.h"
    1378             : 
    1379             : // [Guard]
    1380             : #endif // ASMJIT_BUILD_X86
    1381             : #pragma GCC diagnostic pop
    1382             : #endif // __PLUMED_HAS_ASMJIT

Generated by: LCOV version 1.16