Line data Source code
1 : /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 : Copyright (c) 2008-2017, Petr Kobalicek
3 :
4 : This software is provided 'as-is', without any express or implied
5 : warranty. In no event will the authors be held liable for any damages
6 : arising from the use of this software.
7 :
8 : Permission is granted to anyone to use this software for any purpose,
9 : including commercial applications, and to alter it and redistribute it
10 : freely, subject to the following restrictions:
11 :
12 : 1. The origin of this software must not be misrepresented; you must not
13 : claim that you wrote the original software. If you use this software
14 : in a product, an acknowledgment in the product documentation would be
15 : appreciated but is not required.
16 : 2. Altered source versions must be plainly marked as such, and must not be
17 : misrepresented as being the original software.
18 : 3. This notice may not be removed or altered from any source distribution.
19 : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
20 : #ifdef __PLUMED_HAS_ASMJIT
21 : #pragma GCC diagnostic push
22 : #pragma GCC diagnostic ignored "-Wpedantic"
23 : // [AsmJit]
24 : // Complete x86/x64 JIT and Remote Assembler for C++.
25 : //
26 : // [License]
27 : // Zlib - See LICENSE.md file in the package.
28 :
29 : // [Export]
30 : #define ASMJIT_EXPORTS
31 :
32 : // [Guard]
33 : #include "./asmjit_build.h"
34 : #if defined(ASMJIT_BUILD_X86)
35 :
36 : // [Dependencies]
37 : #include "./x86internal_p.h"
38 :
39 : // [Api-Begin]
40 : #include "./asmjit_apibegin.h"
41 :
42 : namespace PLMD {
43 : namespace asmjit {
44 :
45 : // ============================================================================
46 : // [asmjit::X86Internal - Helpers]
47 : // ============================================================================
48 :
49 : static ASMJIT_INLINE uint32_t x86GetXmmMovInst(const FuncFrameLayout& layout) {
50 : bool avx = layout.isAvxEnabled();
51 : bool aligned = layout.hasAlignedVecSR();
52 :
53 0 : return aligned ? (avx ? X86Inst::kIdVmovaps : X86Inst::kIdMovaps)
54 : : (avx ? X86Inst::kIdVmovups : X86Inst::kIdMovups);
55 : }
56 :
57 : static ASMJIT_INLINE uint32_t x86VecTypeIdToRegType(uint32_t typeId) noexcept {
58 1446 : return typeId <= TypeId::_kVec128End ? X86Reg::kRegXmm :
59 : typeId <= TypeId::_kVec256End ? X86Reg::kRegYmm :
60 : X86Reg::kRegZmm ;
61 : }
62 :
63 : // ============================================================================
64 : // [asmjit::X86FuncArgsContext]
65 : // ============================================================================
66 :
67 : // Used by both `X86Internal::argsToFrameInfo()` and `X86Internal::allocArgs()`.
68 : class X86FuncArgsContext {
69 : public:
70 : typedef FuncDetail::Value SrcArg;
71 : typedef FuncArgsMapper::Value DstArg;
72 :
73 : enum { kMaxVRegKinds = Globals::kMaxVRegKinds };
74 :
75 : struct WorkData {
76 : uint32_t archRegs; //!< Architecture provided and allocable regs.
77 : uint32_t workRegs; //!< Registers that can be used by shuffler.
78 : uint32_t usedRegs; //!< Only registers used to pass arguments.
79 : uint32_t srcRegs; //!< Source registers that need shuffling.
80 : uint32_t dstRegs; //!< Destination registers that need shuffling.
81 : uint8_t numOps; //!< Number of operations to finish.
82 : uint8_t numSwaps; //!< Number of register swaps.
83 : uint8_t numStackArgs; //!< Number of stack loads.
84 : uint8_t reserved[9]; //!< Reserved (only used as padding).
85 : uint8_t argIndex[32]; //!< Only valid if the corresponding bit in `usedRegs` is true.
86 : };
87 :
88 : X86FuncArgsContext() noexcept;
89 : Error initWorkData(const FuncArgsMapper& args, const uint32_t* dirtyRegs, bool preservedFP) noexcept;
90 :
91 : Error markRegsForSwaps(FuncFrameInfo& ffi) noexcept;
92 : Error markDstRegsDirty(FuncFrameInfo& ffi) noexcept;
93 : Error markStackArgsReg(FuncFrameInfo& ffi) noexcept;
94 :
95 : // --------------------------------------------------------------------------
96 : // [Members]
97 : // --------------------------------------------------------------------------
98 :
99 : WorkData _workData[kMaxVRegKinds];
100 : bool _hasStackArgs;
101 : bool _hasRegSwaps;
102 : };
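// Editorial note (not part of the original sources): a short illustrative
// sketch of how WorkData gets populated. Assume a hypothetical function
// `double f(double a, double b)` whose arguments are remapped so that `a`
// must end up in xmm3 and `b` in xmm2 (both are passed in xmm0/xmm1 by the
// calling convention). After initWorkData() the vec-kind WorkData would hold:
//
//   usedRegs = dstRegs = mask(2) | mask(3)   // destinations of the moves
//   srcRegs            = mask(0) | mask(1)   // registers holding the inputs
//   numOps = 2, numSwaps = 0, numStackArgs = 0
//
// because the source and destination register sets do not overlap, so no
// swap and no stack load is required.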
103 :
104 0 : X86FuncArgsContext::X86FuncArgsContext() noexcept {
105 0 : ::memset(_workData, 0, sizeof(_workData));
106 0 : _hasStackArgs = false;
107 0 : _hasRegSwaps = false;
108 0 : }
109 :
110 0 : ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::initWorkData(const FuncArgsMapper& args, const uint32_t* dirtyRegs, bool preservedFP) noexcept {
111 : // This code has to be updated if `kMaxVRegKinds` changes.
112 : ASMJIT_ASSERT(kMaxVRegKinds == 4);
113 :
114 : uint32_t i;
115 : const FuncDetail& func = *args.getFuncDetail();
116 :
117 : uint32_t archType = func.getCallConv().getArchType();
118 0 : uint32_t count = (archType == ArchInfo::kTypeX86) ? 8 : 16;
119 :
120 : // Initialize WorkData::archRegs.
121 0 : _workData[X86Reg::kKindGp ].archRegs = Utils::bits(count) & ~Utils::mask(X86Gp::kIdSp);
122 0 : _workData[X86Reg::kKindMm ].archRegs = Utils::bits(8);
123 0 : _workData[X86Reg::kKindK ].archRegs = Utils::bits(8);
124 0 : _workData[X86Reg::kKindVec].archRegs = Utils::bits(count);
125 :
126 0 : if (preservedFP)
127 0 : _workData[X86Reg::kKindGp].archRegs &= ~Utils::mask(X86Gp::kIdBp);
128 :
129 : // Initialize WorkData::workRegs.
130 0 : for (i = 0; i < kMaxVRegKinds; i++)
131 0 : _workData[i].workRegs = _workData[i].archRegs & (dirtyRegs[i] | ~func.getCallConv().getPreservedRegs(i));
132 :
133 : // Build WorkData.
134 0 : for (i = 0; i < kFuncArgCountLoHi; i++) {
135 0 : const DstArg& dstArg = args.getArg(i);
136 0 : if (!dstArg.isAssigned()) continue;
137 :
138 : const SrcArg& srcArg = func.getArg(i);
139 0 : if (ASMJIT_UNLIKELY(!srcArg.isAssigned()))
140 : return DebugUtils::errored(kErrorInvalidState);
141 :
142 : uint32_t dstRegType = dstArg.getRegType();
143 0 : if (ASMJIT_UNLIKELY(dstRegType >= X86Reg::kRegCount))
144 : return DebugUtils::errored(kErrorInvalidRegType);
145 :
146 : uint32_t dstRegKind = X86Reg::kindOf(dstRegType);
147 0 : if (ASMJIT_UNLIKELY(dstRegKind >= kMaxVRegKinds))
148 : return DebugUtils::errored(kErrorInvalidState);
149 :
150 : WorkData& dstData = _workData[dstRegKind];
151 : uint32_t dstRegId = dstArg.getRegId();
152 0 : if (ASMJIT_UNLIKELY(dstRegId >= 32 || !(dstData.archRegs & Utils::mask(dstRegId))))
153 : return DebugUtils::errored(kErrorInvalidPhysId);
154 :
155 : uint32_t dstRegMask = Utils::mask(dstRegId);
156 0 : if (ASMJIT_UNLIKELY(dstData.usedRegs & dstRegMask))
157 : return DebugUtils::errored(kErrorOverlappedRegs);
158 :
159 0 : dstData.usedRegs |= dstRegMask;
160 0 : dstData.argIndex[dstRegId] = static_cast<uint8_t>(i);
161 :
162 0 : if (srcArg.byReg()) {
163 : uint32_t srcRegKind = X86Reg::kindOf(srcArg.getRegType());
164 : uint32_t srcRegId = srcArg.getRegId();
165 : uint32_t srcRegMask = Utils::mask(srcRegId);
166 :
167 0 : if (dstRegKind == srcRegKind) {
168 : // The best case - the register is already allocated where it is expected to be.
169 0 : if (dstRegId == srcRegId) continue;
170 :
171 : // Detect a register swap.
172 0 : if (dstData.usedRegs & srcRegMask) {
173 0 : const SrcArg& ref = func.getArg(dstData.argIndex[srcRegId]);
174 0 : if (ref.byReg() && X86Reg::kindOf(ref.getRegType()) == dstRegKind && ref.getRegId() == dstRegId) {
175 0 : dstData.numSwaps++;
176 0 : _hasRegSwaps = true;
177 : }
178 : }
179 0 : dstData.srcRegs |= srcRegMask;
180 : }
181 : else {
182 0 : if (ASMJIT_UNLIKELY(srcRegKind >= kMaxVRegKinds))
183 : return DebugUtils::errored(kErrorInvalidState);
184 :
185 : WorkData& srcData = _workData[srcRegKind];
186 0 : srcData.srcRegs |= srcRegMask;
187 : }
188 : }
189 : else {
190 0 : dstData.numStackArgs++;
191 0 : _hasStackArgs = true;
192 : }
193 :
194 0 : dstData.numOps++;
195 0 : dstData.dstRegs |= dstRegMask;
196 : }
197 :
198 : return kErrorOk;
199 : }
200 :
201 0 : ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::markDstRegsDirty(FuncFrameInfo& ffi) noexcept {
202 0 : for (uint32_t i = 0; i < kMaxVRegKinds; i++) {
203 : WorkData& wd = _workData[i];
204 0 : uint32_t regs = wd.usedRegs | wd.dstRegs;
205 :
206 0 : wd.workRegs |= regs;
207 : ffi.addDirtyRegs(i, regs);
208 : }
209 :
210 0 : return kErrorOk;
211 : }
212 :
213 0 : ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::markRegsForSwaps(FuncFrameInfo& ffi) noexcept {
214 0 : if (!_hasRegSwaps)
215 : return kErrorOk;
216 :
217 : // If some registers require swapping then select one dirty register that
218 : // can be used as a temporary. We could also do it without one (by using
219 : // xors), but using a temporary register is both safer and faster.
220 0 : for (uint32_t i = 0; i < kMaxVRegKinds; i++) {
221 : // Skip all register kinds where swapping is natively supported (GP regs).
222 0 : if (i == X86Reg::kKindGp) continue;
223 :
224 : // Skip all register kinds that don't require swapping.
225 : WorkData& wd = _workData[i];
226 0 : if (!wd.numSwaps) continue;
227 :
228 : // Initially, pick some clobbered or dirty register.
229 0 : uint32_t workRegs = wd.workRegs;
230 0 : uint32_t regs = workRegs & ~(wd.usedRegs | wd.dstRegs);
231 :
232 : // If that didn't work out pick some register which is not in 'used'.
233 0 : if (!regs) regs = workRegs & ~wd.usedRegs;
234 :
235 : // If that didn't work out pick any other register that is allocable.
236 : // This last resort case will, however, result in marking one more
237 : // register dirty.
238 0 : if (!regs) regs = wd.archRegs & ~workRegs;
239 :
240 : // If that didn't work out we will have to use xors instead of moves.
241 0 : if (!regs) continue;
242 :
243 : uint32_t regMask = Utils::mask(Utils::findFirstBit(regs));
244 0 : wd.workRegs |= regMask;
245 : ffi.addDirtyRegs(i, regMask);
246 : }
247 :
248 : return kErrorOk;
249 : }
250 :
251 0 : ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::markStackArgsReg(FuncFrameInfo& ffi) noexcept {
252 0 : if (!_hasStackArgs)
253 : return kErrorOk;
254 :
255 : // Decide which register to use to hold the stack base address.
256 0 : if (!ffi.hasPreservedFP()) {
257 : WorkData& wd = _workData[X86Reg::kKindGp];
258 : uint32_t saRegId = ffi.getStackArgsRegId();
259 0 : uint32_t usedRegs = wd.usedRegs;
260 :
261 0 : if (saRegId != Globals::kInvalidRegId) {
262 : // Check that the user-chosen SA register doesn't overlap with others.
263 : // However, it's fine if it overlaps with some 'dstMove' register.
264 0 : if (usedRegs & Utils::mask(saRegId))
265 : return DebugUtils::errored(kErrorOverlappingStackRegWithRegArg);
266 : }
267 : else {
268 : // Initially, pick some clobbered or dirty register that is neither
269 : // in 'used' nor in 'dstMove'. That's the safest bet as the
270 : // register won't collide with anything right now.
271 0 : uint32_t regs = wd.workRegs & ~(usedRegs | wd.dstRegs);
272 :
273 : // If that didn't work out pick some register which is not in 'used'.
274 0 : if (!regs) regs = wd.workRegs & ~usedRegs;
275 :
276 : // If that didn't work out then we have to make one more register dirty.
277 0 : if (!regs) regs = wd.archRegs & ~wd.workRegs;
278 :
279 : // If that didn't work out we can't continue.
280 0 : if (ASMJIT_UNLIKELY(!regs))
281 : return DebugUtils::errored(kErrorNoMorePhysRegs);
282 :
283 : saRegId = Utils::findFirstBit(regs);
284 : ffi.setStackArgsRegId(saRegId);
285 : }
286 : }
287 : else {
288 : ffi.setStackArgsRegId(X86Gp::kIdBp);
289 : }
290 :
291 : return kErrorOk;
292 : }
293 :
294 : // ============================================================================
295 : // [asmjit::X86Internal - CallConv]
296 : // ============================================================================
297 :
298 3580 : ASMJIT_FAVOR_SIZE Error X86Internal::initCallConv(CallConv& cc, uint32_t ccId) noexcept {
299 : const uint32_t kKindGp = X86Reg::kKindGp;
300 : const uint32_t kKindVec = X86Reg::kKindVec;
301 : const uint32_t kKindMm = X86Reg::kKindMm;
302 : const uint32_t kKindK = X86Reg::kKindK;
303 :
304 : const uint32_t kZax = X86Gp::kIdAx;
305 : const uint32_t kZbx = X86Gp::kIdBx;
306 : const uint32_t kZcx = X86Gp::kIdCx;
307 : const uint32_t kZdx = X86Gp::kIdDx;
308 : const uint32_t kZsp = X86Gp::kIdSp;
309 : const uint32_t kZbp = X86Gp::kIdBp;
310 : const uint32_t kZsi = X86Gp::kIdSi;
311 : const uint32_t kZdi = X86Gp::kIdDi;
312 :
313 3580 : switch (ccId) {
314 : case CallConv::kIdX86StdCall:
315 : cc.setFlags(CallConv::kFlagCalleePopsStack);
316 0 : goto X86CallConv;
317 :
318 : case CallConv::kIdX86MsThisCall:
319 : cc.setFlags(CallConv::kFlagCalleePopsStack);
320 : cc.setPassedOrder(kKindGp, kZcx);
321 0 : goto X86CallConv;
322 :
323 : case CallConv::kIdX86MsFastCall:
324 : case CallConv::kIdX86GccFastCall:
325 : cc.setFlags(CallConv::kFlagCalleePopsStack);
326 : cc.setPassedOrder(kKindGp, kZcx, kZdx);
327 0 : goto X86CallConv;
328 :
329 : case CallConv::kIdX86GccRegParm1:
330 : cc.setPassedOrder(kKindGp, kZax);
331 0 : goto X86CallConv;
332 :
333 : case CallConv::kIdX86GccRegParm2:
334 : cc.setPassedOrder(kKindGp, kZax, kZdx);
335 0 : goto X86CallConv;
336 :
337 : case CallConv::kIdX86GccRegParm3:
338 : cc.setPassedOrder(kKindGp, kZax, kZdx, kZcx);
339 0 : goto X86CallConv;
340 :
341 : case CallConv::kIdX86CDecl:
342 0 : X86CallConv:
343 : cc.setNaturalStackAlignment(4);
344 : cc.setArchType(ArchInfo::kTypeX86);
345 : cc.setPreservedRegs(kKindGp, Utils::mask(kZbx, kZsp, kZbp, kZsi, kZdi));
346 : break;
347 :
348 : case CallConv::kIdX86Win64:
349 : cc.setArchType(ArchInfo::kTypeX64);
350 : cc.setAlgorithm(CallConv::kAlgorithmWin64);
351 : cc.setFlags(CallConv::kFlagPassFloatsByVec | CallConv::kFlagIndirectVecArgs);
352 : cc.setNaturalStackAlignment(16);
353 : cc.setSpillZoneSize(32);
354 : cc.setPassedOrder(kKindGp, kZcx, kZdx, 8, 9);
355 : cc.setPassedOrder(kKindVec, 0, 1, 2, 3);
356 : cc.setPreservedRegs(kKindGp, Utils::mask(kZbx, kZsp, kZbp, kZsi, kZdi, 12, 13, 14, 15));
357 : cc.setPreservedRegs(kKindVec, Utils::mask(6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
358 : break;
359 :
360 : case CallConv::kIdX86SysV64:
361 : cc.setArchType(ArchInfo::kTypeX64);
362 : cc.setFlags(CallConv::kFlagPassFloatsByVec);
363 : cc.setNaturalStackAlignment(16);
364 : cc.setRedZoneSize(128);
365 : cc.setPassedOrder(kKindGp, kZdi, kZsi, kZdx, kZcx, 8, 9);
366 : cc.setPassedOrder(kKindVec, 0, 1, 2, 3, 4, 5, 6, 7);
367 : cc.setPreservedRegs(kKindGp, Utils::mask(kZbx, kZsp, kZbp, 12, 13, 14, 15));
368 : break;
369 :
370 0 : case CallConv::kIdX86FastEval2:
371 : case CallConv::kIdX86FastEval3:
372 : case CallConv::kIdX86FastEval4: {
373 0 : uint32_t n = ccId - CallConv::kIdX86FastEval2;
374 :
375 : cc.setArchType(ArchInfo::kTypeX86);
376 : cc.setFlags(CallConv::kFlagPassFloatsByVec);
377 : cc.setNaturalStackAlignment(16);
378 : cc.setPassedOrder(kKindGp, kZax, kZdx, kZcx, kZsi, kZdi);
379 : cc.setPassedOrder(kKindMm, 0, 1, 2, 3, 4, 5, 6, 7);
380 : cc.setPassedOrder(kKindVec, 0, 1, 2, 3, 4, 5, 6, 7);
381 :
382 : cc.setPreservedRegs(kKindGp , Utils::bits(8));
383 0 : cc.setPreservedRegs(kKindVec, Utils::bits(8) & ~Utils::bits(n));
384 : cc.setPreservedRegs(kKindMm , Utils::bits(8));
385 : cc.setPreservedRegs(kKindK , Utils::bits(8));
386 : break;
387 : }
388 :
389 0 : case CallConv::kIdX64FastEval2:
390 : case CallConv::kIdX64FastEval3:
391 : case CallConv::kIdX64FastEval4: {
392 0 : uint32_t n = ccId - CallConv::kIdX64FastEval2;
393 :
394 : cc.setArchType(ArchInfo::kTypeX64);
395 : cc.setFlags(CallConv::kFlagPassFloatsByVec);
396 : cc.setNaturalStackAlignment(16);
397 : cc.setPassedOrder(kKindGp, kZax, kZdx, kZcx, kZsi, kZdi);
398 : cc.setPassedOrder(kKindMm, 0, 1, 2, 3, 4, 5, 6, 7);
399 : cc.setPassedOrder(kKindVec, 0, 1, 2, 3, 4, 5, 6, 7);
400 :
401 : cc.setPreservedRegs(kKindGp , Utils::bits(16));
402 0 : cc.setPreservedRegs(kKindVec,~Utils::bits(n));
403 : cc.setPreservedRegs(kKindMm , Utils::bits(8));
404 : cc.setPreservedRegs(kKindK , Utils::bits(8));
405 : break;
406 : }
407 :
408 : default:
409 : return DebugUtils::errored(kErrorInvalidArgument);
410 : }
411 :
412 : cc.setId(ccId);
413 3580 : return kErrorOk;
414 : }
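// Editorial sketch (not part of the original sources; names outside this
// file are assumptions): what the kIdX86SysV64 case above produces.
//
//   CallConv cc;                 // assumed to be zeroed/reset before use
//   X86Internal::initCallConv(cc, CallConv::kIdX86SysV64);
//   // GP arguments are passed in rdi, rsi, rdx, rcx, r8, r9; vector
//   // arguments in xmm0..xmm7; natural stack alignment is 16 bytes; a
//   // 128-byte red zone is available; rbx, rsp, rbp and r12..r15 are
//   // preserved across the call - exactly the values set above.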
415 :
416 : // ============================================================================
417 : // [asmjit::X86Internal - FuncDetail]
418 : // ============================================================================
419 :
420 3580 : ASMJIT_FAVOR_SIZE Error X86Internal::initFuncDetail(FuncDetail& func, const FuncSignature& sign, uint32_t gpSize) noexcept {
421 : const CallConv& cc = func.getCallConv();
422 : uint32_t archType = cc.getArchType();
423 :
424 : uint32_t i;
425 : uint32_t argCount = func.getArgCount();
426 :
427 3580 : if (func.getRetCount() != 0) {
428 : uint32_t typeId = func._rets[0].getTypeId();
429 3580 : switch (typeId) {
430 0 : case TypeId::kI64:
431 : case TypeId::kU64: {
432 0 : if (archType == ArchInfo::kTypeX86) {
433 : // Convert a 64-bit return to two 32-bit returns.
434 0 : func._retCount = 2;
435 0 : typeId -= 2;
436 :
437 : // 64-bit value is returned in EDX:EAX on X86.
438 : func._rets[0].initReg(typeId, X86Gp::kRegGpd, X86Gp::kIdAx);
439 : func._rets[1].initReg(typeId, X86Gp::kRegGpd, X86Gp::kIdDx);
440 : break;
441 : }
442 : else {
443 : func._rets[0].initReg(typeId, X86Gp::kRegGpq, X86Gp::kIdAx);
444 : }
445 : break;
446 : }
447 :
448 0 : case TypeId::kI8:
449 : case TypeId::kU8:
450 : case TypeId::kI16:
451 : case TypeId::kU16:
452 : case TypeId::kI32:
453 : case TypeId::kU32: {
454 : func._rets[0].assignToReg(X86Gp::kRegGpd, X86Gp::kIdAx);
455 : break;
456 : }
457 :
458 3580 : case TypeId::kF32:
459 : case TypeId::kF64: {
460 3580 : uint32_t regType = (archType == ArchInfo::kTypeX86) ? X86Reg::kRegFp : X86Reg::kRegXmm;
461 : func._rets[0].assignToReg(regType, 0);
462 : break;
463 : }
464 :
465 0 : case TypeId::kF80: {
466 : // 80-bit floats are always returned by FP0.
467 : func._rets[0].assignToReg(X86Reg::kRegFp, 0);
468 : break;
469 : }
470 :
471 0 : case TypeId::kMmx32:
472 : case TypeId::kMmx64: {
473 : // On X64, MM register(s) are returned through XMM, or through GPQ when the Win64 algorithm is used.
474 : uint32_t regType = X86Reg::kRegMm;
475 0 : if (archType != ArchInfo::kTypeX86)
476 0 : regType = cc.getAlgorithm() == CallConv::kAlgorithmDefault ? X86Reg::kRegXmm : X86Reg::kRegGpq;
477 :
478 : func._rets[0].assignToReg(regType, 0);
479 : break;
480 : }
481 :
482 0 : default: {
483 : func._rets[0].assignToReg(x86VecTypeIdToRegType(typeId), 0);
484 : break;
485 : }
486 : }
487 : }
488 :
489 3580 : uint32_t stackBase = gpSize;
490 3580 : uint32_t stackOffset = stackBase + cc._spillZoneSize;
491 :
492 3580 : if (cc.getAlgorithm() == CallConv::kAlgorithmDefault) {
493 : uint32_t gpzPos = 0;
494 : uint32_t vecPos = 0;
495 :
496 5430 : for (i = 0; i < argCount; i++) {
497 : FuncDetail::Value& arg = func._args[i];
498 : uint32_t typeId = arg.getTypeId();
499 :
500 1850 : if (TypeId::isInt(typeId)) {
501 404 : uint32_t regId = gpzPos < CallConv::kNumRegArgsPerKind ? cc._passedOrder[X86Reg::kKindGp].id[gpzPos] : Globals::kInvalidRegId;
502 404 : if (regId != Globals::kInvalidRegId) {
503 : uint32_t regType = (typeId <= TypeId::kU32)
504 404 : ? X86Reg::kRegGpd
505 : : X86Reg::kRegGpq;
506 : arg.assignToReg(regType, regId);
507 : func.addUsedRegs(X86Reg::kKindGp, Utils::mask(regId));
508 404 : gpzPos++;
509 : }
510 : else {
511 0 : uint32_t size = std::max<uint32_t>(TypeId::sizeOf(typeId), gpSize);
512 : arg.assignToStack(stackOffset);
513 0 : stackOffset += size;
514 : }
515 404 : continue;
516 404 : }
517 :
518 1446 : if (TypeId::isFloat(typeId) || TypeId::isVec(typeId)) {
519 1446 : uint32_t regId = vecPos < CallConv::kNumRegArgsPerKind ? cc._passedOrder[X86Reg::kKindVec].id[vecPos] : Globals::kInvalidRegId;
520 :
521 : // If this is a float and `kFlagPassFloatsByVec` is not set, we have to pass it on the stack.
522 1446 : if (TypeId::isFloat(typeId) && !cc.hasFlag(CallConv::kFlagPassFloatsByVec))
523 : regId = Globals::kInvalidRegId;
524 :
525 1446 : if (regId != Globals::kInvalidRegId) {
526 : arg.initReg(typeId, x86VecTypeIdToRegType(typeId), regId);
527 : func.addUsedRegs(X86Reg::kKindVec, Utils::mask(regId));
528 1446 : vecPos++;
529 : }
530 : else {
531 : int32_t size = TypeId::sizeOf(typeId);
532 : arg.assignToStack(stackOffset);
533 0 : stackOffset += size;
534 : }
535 1446 : continue;
536 1446 : }
537 : }
538 : }
539 :
540 3580 : if (cc.getAlgorithm() == CallConv::kAlgorithmWin64) {
541 0 : for (i = 0; i < argCount; i++) {
542 : FuncDetail::Value& arg = func._args[i];
543 :
544 : uint32_t typeId = arg.getTypeId();
545 : uint32_t size = TypeId::sizeOf(typeId);
546 :
547 0 : if (TypeId::isInt(typeId) || TypeId::isMmx(typeId)) {
548 0 : uint32_t regId = i < CallConv::kNumRegArgsPerKind ? cc._passedOrder[X86Reg::kKindGp].id[i] : Globals::kInvalidRegId;
549 0 : if (regId != Globals::kInvalidRegId) {
550 0 : uint32_t regType = (size <= 4 && !TypeId::isMmx(typeId))
551 0 : ? X86Reg::kRegGpd
552 : : X86Reg::kRegGpq;
553 :
554 : arg.assignToReg(regType, regId);
555 : func.addUsedRegs(X86Reg::kKindGp, Utils::mask(regId));
556 : }
557 : else {
558 : arg.assignToStack(stackOffset);
559 0 : stackOffset += gpSize;
560 : }
561 0 : continue;
562 0 : }
563 :
564 0 : if (TypeId::isFloat(typeId) || TypeId::isVec(typeId)) {
565 0 : uint32_t regId = i < CallConv::kNumRegArgsPerKind ? cc._passedOrder[X86Reg::kKindVec].id[i] : Globals::kInvalidRegId;
566 0 : if (regId != Globals::kInvalidRegId && (TypeId::isFloat(typeId) || cc.hasFlag(CallConv::kFlagVectorCall))) {
567 : uint32_t regType = x86VecTypeIdToRegType(typeId);
568 : uint32_t regId = cc._passedOrder[X86Reg::kKindVec].id[i];
569 :
570 : arg.assignToReg(regType, regId);
571 : func.addUsedRegs(X86Reg::kKindVec, Utils::mask(regId));
572 : }
573 : else {
574 : arg.assignToStack(stackOffset);
575 0 : stackOffset += 8; // Always 8 bytes (float/double).
576 : }
577 0 : continue;
578 0 : }
579 : }
580 : }
581 :
582 3580 : func._argStackSize = stackOffset - stackBase;
583 3580 : return kErrorOk;
584 : }
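// Editorial example (derived from the default-algorithm path above; the
// signature itself is hypothetical): for a SysV64 function
// `double f(int a, double b, double c)` the loop assigns `a` to edi (the
// first GP slot, as a 32-bit GPD register) and `b`, `c` to xmm0 and xmm1
// (the first two vector slots). Nothing is passed on the stack, so
// `_argStackSize` remains 0.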
585 :
586 : // ============================================================================
587 : // [asmjit::X86Internal - FrameLayout]
588 : // ============================================================================
589 :
590 1944 : ASMJIT_FAVOR_SIZE Error X86Internal::initFrameLayout(FuncFrameLayout& layout, const FuncDetail& func, const FuncFrameInfo& ffi) noexcept {
591 : layout.reset();
592 :
593 : uint32_t kind;
594 1944 : uint32_t gpSize = (func.getCallConv().getArchType() == ArchInfo::kTypeX86) ? 4 : 8;
595 :
596 : // Calculate a bit-mask of all registers that must be saved & restored.
597 9720 : for (kind = 0; kind < Globals::kMaxVRegKinds; kind++)
598 7776 : layout._savedRegs[kind] = (ffi.getDirtyRegs(kind) & ~func.getPassedRegs(kind)) & func.getPreservedRegs(kind);
599 :
600 : // Include EBP|RBP if the function preserves the frame-pointer.
601 1944 : if (ffi.hasPreservedFP()) {
602 0 : layout._preservedFP = true;
603 0 : layout._savedRegs[X86Reg::kKindGp] |= Utils::mask(X86Gp::kIdBp);
604 : }
605 :
606 : // Exclude ESP/RSP - this register is never included in saved-regs.
607 1944 : layout._savedRegs[X86Reg::kKindGp] &= ~Utils::mask(X86Gp::kIdSp);
608 :
609 : // Calculate the final stack alignment.
610 : uint32_t stackAlignment =
611 : std::max<uint32_t>(
612 : std::max<uint32_t>(
613 1944 : ffi.getStackFrameAlignment(),
614 1944 : ffi.getCallFrameAlignment()),
615 1944 : func.getCallConv().getNaturalStackAlignment());
616 1944 : layout._stackAlignment = static_cast<uint8_t>(stackAlignment);
617 :
618 : // Calculate if dynamic stack alignment is required. If true, the function has
619 : // to align its stack dynamically to match `_stackAlignment` and must access
620 : // its stack-based arguments through `_stackArgsRegId`.
621 1944 : bool dsa = stackAlignment > func.getCallConv().getNaturalStackAlignment() && stackAlignment >= 16;
622 0 : layout._dynamicAlignment = dsa;
623 :
624 : // This flag describes whether the prolog inserter must store the previous ESP|RSP
625 : // on the stack so the epilog inserter can restore it before returning.
626 0 : bool dsaSlotUsed = dsa && !ffi.hasPreservedFP();
627 1944 : layout._dsaSlotUsed = dsaSlotUsed;
628 :
629 : // These two are identical if the function doesn't align its stack dynamically.
630 : uint32_t stackArgsRegId = ffi.getStackArgsRegId();
631 1944 : if (stackArgsRegId == Globals::kInvalidRegId)
632 : stackArgsRegId = X86Gp::kIdSp;
633 :
634 : // Fix stack arguments base-register from ESP|RSP to EBP|RBP in case it was
635 : // not picked before and the function performs dynamic stack alignment.
636 1944 : if (dsa && stackArgsRegId == X86Gp::kIdSp)
637 : stackArgsRegId = X86Gp::kIdBp;
638 :
639 1944 : if (stackArgsRegId != X86Gp::kIdSp)
640 0 : layout._savedRegs[X86Reg::kKindGp] |= Utils::mask(stackArgsRegId) & func.getPreservedRegs(X86Gp::kKindGp);
641 :
642 1944 : layout._stackBaseRegId = X86Gp::kIdSp;
643 1944 : layout._stackArgsRegId = static_cast<uint8_t>(stackArgsRegId);
644 :
645 : // Setup stack size used to save preserved registers.
646 1944 : layout._gpStackSize = Utils::bitCount(layout.getSavedRegs(X86Reg::kKindGp )) * gpSize;
647 1944 : layout._vecStackSize = Utils::bitCount(layout.getSavedRegs(X86Reg::kKindVec)) * 16 +
648 : Utils::bitCount(layout.getSavedRegs(X86Reg::kKindMm )) * 8 ;
649 :
650 : uint32_t v = 0; // The beginning of the stack frame, aligned to CallFrame alignment.
651 1944 : v += ffi._callFrameSize; // Count '_callFrameSize' <- This is used to call functions.
652 : v = Utils::alignTo(v, stackAlignment);// Align to function's SA
653 :
654 1944 : layout._stackBaseOffset = v; // Store '_stackBaseOffset'<- Function's own stack starts here..
655 1944 : v += ffi._stackFrameSize; // Count '_stackFrameSize' <- Function's own stack ends here.
656 :
657 : // If the stack is aligned to at least 16 bytes, calculate the alignment necessary
658 : // to store vector registers, and set `FuncFrameInfo::kX86FlagAlignedVecSR` to inform
659 : // the prolog/epilog inserter that it can use aligned stores/loads to
660 : // save/restore VEC registers.
661 1944 : if (stackAlignment >= 16 && layout._vecStackSize) {
662 : v = Utils::alignTo(v, 16); // Align '_vecStackOffset'.
663 0 : layout._alignedVecSR = true;
664 : }
665 :
666 1944 : layout._vecStackOffset = v; // Store '_vecStackOffset' <- Functions VEC Save|Restore starts here.
667 1944 : v += layout._vecStackSize; // Count '_vecStackSize' <- Functions VEC Save|Restore ends here.
668 :
669 1944 : if (dsaSlotUsed) {
670 0 : layout._dsaSlot = v; // Store '_dsaSlot' <- Old stack pointer is stored here.
671 0 : v += gpSize;
672 : }
673 :
674 : // The return address should be stored after GP save/restore regs. It has
675 : // the same size as `gpSize` (basically the native register/pointer size).
676 : // We don't adjust it now as `v` now contains the exact size that the
677 : // function requires to adjust (call frame + stack frame, vec stack size).
678 : // The stack (if we consider this size) is misaligned now, as it's always
679 : // aligned before the function call - when `call()` is executed it pushes
680 : // the current EIP|RIP onto the stack, and misaligns it by 12 or 8 bytes
681 : // (depending on the architecture). So count the number of bytes needed to align
682 : // it up to the function's CallFrame (the beginning).
683 1944 : if (v || ffi.hasCalls())
684 1080 : v += Utils::alignDiff(v + layout._gpStackSize + gpSize, stackAlignment);
685 :
686 1944 : layout._stackAdjustment = v; // Store '_stackAdjustment'<- SA used by 'add zsp, SA' and 'sub zsp, SA'.
687 1944 : layout._gpStackOffset = v; // Store '_gpStackOffset' <- Functions GP Save|Restore starts here.
688 1944 : v += layout._gpStackSize; // Count '_gpStackSize' <- Functions GP Save|Restore ends here.
689 :
690 1944 : v += gpSize; // Count 'ReturnAddress'.
691 1944 : v += func.getSpillZoneSize(); // Count 'SpillZoneSize'.
692 :
693 : // Calculate where function arguments start, relative to the stackArgsRegId.
694 : // If the register that will be used to access arguments passed by stack is
695 : // ESP|RSP then it's exactly where we are now, otherwise we must calculate
696 : // how many 'push regs' we did and adjust it based on that.
697 : uint32_t stackArgsOffset = v;
698 1944 : if (stackArgsRegId != X86Gp::kIdSp) {
699 0 : if (ffi.hasPreservedFP())
700 : stackArgsOffset = gpSize;
701 : else
702 : stackArgsOffset = layout._gpStackSize;
703 : }
704 1944 : layout._stackArgsOffset = stackArgsOffset;
705 :
706 : // If the function does dynamic stack adjustment then the stack-adjustment
707 : // must be aligned.
708 1944 : if (dsa)
709 0 : layout._stackAdjustment = Utils::alignTo(layout._stackAdjustment, stackAlignment);
710 :
711 : // Initialize variables based on CallConv flags.
712 1944 : if (func.hasFlag(CallConv::kFlagCalleePopsStack))
713 0 : layout._calleeStackCleanup = static_cast<uint16_t>(func.getArgStackSize());
714 :
715 : // Initialize variables based on FFI flags.
716 1944 : layout._mmxCleanup = ffi.hasMmxCleanup();
717 1944 : layout._avxEnabled = ffi.isAvxEnabled();
718 1944 : layout._avxCleanup = ffi.hasAvxCleanup();
719 :
720 1944 : return kErrorOk;
721 : }
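// Editorial note (not part of the original sources): the offsets computed
// above describe the following frame picture, measured from ESP|RSP after
// the prolog's 'sub zsp, _stackAdjustment' (higher offsets are closer to
// the caller; several areas are optional):
//
//   [stack arguments]            <- _stackArgsOffset
//   [spill zone (Win64 only)]
//   [return address]
//   [saved GP registers]         <- _gpStackOffset, size _gpStackSize
//   [DSA slot (if used)]         <- _dsaSlot
//   [saved VEC registers]        <- _vecStackOffset, size _vecStackSize
//   [function's own stack]       <- _stackBaseOffset
//   [call frame for callees]     <- offset 0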
722 :
723 : // ============================================================================
724 : // [asmjit::X86Internal - ArgsToFrameInfo]
725 : // ============================================================================
726 :
727 0 : ASMJIT_FAVOR_SIZE Error X86Internal::argsToFrameInfo(const FuncArgsMapper& args, FuncFrameInfo& ffi) noexcept {
728 0 : X86FuncArgsContext ctx;
729 0 : ASMJIT_PROPAGATE(ctx.initWorkData(args, ffi._dirtyRegs, ffi.hasPreservedFP()));
730 :
731 0 : ASMJIT_PROPAGATE(ctx.markDstRegsDirty(ffi));
732 0 : ASMJIT_PROPAGATE(ctx.markRegsForSwaps(ffi));
733 0 : ASMJIT_PROPAGATE(ctx.markStackArgsReg(ffi));
734 : return kErrorOk;
735 : }
736 :
737 : // ============================================================================
738 : // [asmjit::X86Internal - Emit Helpers]
739 : // ============================================================================
740 :
741 9084 : ASMJIT_FAVOR_SIZE Error X86Internal::emitRegMove(X86Emitter* emitter,
742 : const Operand_& dst_,
743 : const Operand_& src_, uint32_t typeId, bool avxEnabled, const char* comment) {
744 :
745 : // Invalid or abstract TypeIds are not allowed.
746 : ASMJIT_ASSERT(TypeId::isValid(typeId) && !TypeId::isAbstract(typeId));
747 :
748 : Operand dst(dst_);
749 : Operand src(src_);
750 :
751 : uint32_t instId = Inst::kIdNone;
752 : uint32_t memFlags = 0;
753 :
754 : enum MemFlags {
755 : kDstMem = 0x1,
756 : kSrcMem = 0x2
757 : };
758 :
759 : // Detect memory operands and patch them to have the same size as the register.
760 : // CodeCompiler always sets the memory size of allocs and spills, so this shouldn't
761 : // really be necessary; however, since this function was separated from the Compiler
762 : // it's better to make sure that the size is always specified, as we may use
763 : // 'movzx' and 'movsx', which rely on it.
764 9084 : if (dst.isMem()) { memFlags |= kDstMem; dst.as<X86Mem>().setSize(src.getSize()); }
765 9084 : if (src.isMem()) { memFlags |= kSrcMem; src.as<X86Mem>().setSize(dst.getSize()); }
766 :
767 9084 : switch (typeId) {
768 0 : case TypeId::kI8:
769 : case TypeId::kU8:
770 : case TypeId::kI16:
771 : case TypeId::kU16:
772 : // Special case - 'movzx' load.
773 0 : if (memFlags & kSrcMem) {
774 : instId = X86Inst::kIdMovzx;
775 : dst.setSignature(X86RegTraits<X86Reg::kRegGpd>::kSignature);
776 : }
777 0 : else if (!memFlags) {
778 : // Change both destination and source registers to GPD (safer, no dependencies).
779 : dst.setSignature(X86RegTraits<X86Reg::kRegGpd>::kSignature);
780 : src.setSignature(X86RegTraits<X86Reg::kRegGpd>::kSignature);
781 : }
782 : ASMJIT_FALLTHROUGH;
783 :
784 : case TypeId::kI32:
785 : case TypeId::kU32:
786 : case TypeId::kI64:
787 : case TypeId::kU64:
788 : instId = X86Inst::kIdMov;
789 : break;
790 :
791 0 : case TypeId::kMmx32:
792 : instId = X86Inst::kIdMovd;
793 0 : if (memFlags) break;
794 : ASMJIT_FALLTHROUGH;
795 0 : case TypeId::kMmx64 : instId = X86Inst::kIdMovq ; break;
796 : case TypeId::kMask8 : instId = X86Inst::kIdKmovb; break;
797 0 : case TypeId::kMask16: instId = X86Inst::kIdKmovw; break;
798 0 : case TypeId::kMask32: instId = X86Inst::kIdKmovd; break;
799 0 : case TypeId::kMask64: instId = X86Inst::kIdKmovq; break;
800 :
801 : default: {
802 : uint32_t elementTypeId = TypeId::elementOf(typeId);
803 9084 : if (TypeId::isVec32(typeId) && memFlags) {
804 0 : if (elementTypeId == TypeId::kF32)
805 0 : instId = avxEnabled ? X86Inst::kIdVmovss : X86Inst::kIdMovss;
806 : else
807 0 : instId = avxEnabled ? X86Inst::kIdVmovd : X86Inst::kIdMovd;
808 : break;
809 : }
810 :
811 9084 : if (TypeId::isVec64(typeId) && memFlags) {
812 8428 : if (elementTypeId == TypeId::kF64)
813 8428 : instId = avxEnabled ? X86Inst::kIdVmovsd : X86Inst::kIdMovsd;
814 : else
815 0 : instId = avxEnabled ? X86Inst::kIdVmovq : X86Inst::kIdMovq;
816 : break;
817 : }
818 :
819 656 : if (elementTypeId == TypeId::kF32)
820 0 : instId = avxEnabled ? X86Inst::kIdVmovaps : X86Inst::kIdMovaps;
821 656 : else if (elementTypeId == TypeId::kF64)
822 656 : instId = avxEnabled ? X86Inst::kIdVmovapd : X86Inst::kIdMovapd;
823 0 : else if (typeId <= TypeId::_kVec256End)
824 0 : instId = avxEnabled ? X86Inst::kIdVmovdqa : X86Inst::kIdMovdqa;
825 0 : else if (elementTypeId <= TypeId::kU32)
826 : instId = X86Inst::kIdVmovdqa32;
827 : else
828 : instId = X86Inst::kIdVmovdqa64;
829 : break;
830 : }
831 : }
832 :
833 : if (!instId)
834 : return DebugUtils::errored(kErrorInvalidState);
835 :
836 : emitter->setInlineComment(comment);
837 9084 : return emitter->emit(instId, dst, src);
838 : }
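// Editorial usage sketch (not part of the original sources; the stack slot
// is a made-up example):
//
//   X86Mem slot = x86::ptr(emitter->zsp(), 16, 4);   // [esp|rsp + 16], 4 bytes
//   X86Internal::emitRegMove(emitter, x86::eax, slot, TypeId::kU32, false, nullptr);
//
// For a 32-bit unsigned integer the switch above resolves to a plain 'mov';
// smaller integers loaded from memory would use 'movzx' instead.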
839 :
840 0 : ASMJIT_FAVOR_SIZE Error X86Internal::emitArgMove(X86Emitter* emitter,
841 : const X86Reg& dst_, uint32_t dstTypeId,
842 : const Operand_& src_, uint32_t srcTypeId, bool avxEnabled, const char* comment) {
843 :
844 : // Deduce optional `dstTypeId`, which may be `TypeId::kVoid` in some cases.
845 0 : if (!dstTypeId) dstTypeId = x86OpData.archRegs.regTypeToTypeId[dst_.getType()];
846 :
847 : // Invalid or abstract TypeIds are not allowed.
848 : ASMJIT_ASSERT(TypeId::isValid(dstTypeId) && !TypeId::isAbstract(dstTypeId));
849 : ASMJIT_ASSERT(TypeId::isValid(srcTypeId) && !TypeId::isAbstract(srcTypeId));
850 :
851 : X86Reg dst(dst_);
852 : Operand src(src_);
853 :
854 0 : uint32_t dstSize = TypeId::sizeOf(dstTypeId);
855 0 : uint32_t srcSize = TypeId::sizeOf(srcTypeId);
856 :
857 : int32_t instId = Inst::kIdNone;
858 :
859 : // Not a real loop, just 'break' is nicer than 'goto'.
860 : for (;;) {
861 0 : if (TypeId::isInt(dstTypeId)) {
862 0 : if (TypeId::isInt(srcTypeId)) {
863 : instId = X86Inst::kIdMovsx;
864 0 : uint32_t typeOp = (dstTypeId << 8) | srcTypeId;
865 :
866 : // Sign extend by using 'movsx'.
867 0 : if (typeOp == ((TypeId::kI16 << 8) | TypeId::kI8 ) ||
868 0 : typeOp == ((TypeId::kI32 << 8) | TypeId::kI8 ) ||
869 0 : typeOp == ((TypeId::kI32 << 8) | TypeId::kI16) ||
870 0 : typeOp == ((TypeId::kI64 << 8) | TypeId::kI8 ) ||
871 : typeOp == ((TypeId::kI64 << 8) | TypeId::kI16)) break;
872 :
873 : // Sign extend by using 'movsxd'.
874 : instId = X86Inst::kIdMovsxd;
875 : if (typeOp == ((TypeId::kI64 << 8) | TypeId::kI32)) break;
876 : }
877 :
878 0 : if (TypeId::isInt(srcTypeId) || src_.isMem()) {
879 : // Zero extend by using 'movzx' or 'mov'.
880 0 : if (dstSize <= 4 && srcSize < 4) {
881 : instId = X86Inst::kIdMovzx;
882 : dst.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
883 : }
884 : else {
885 : // We should have caught all possibilities where `srcSize` is less
886 : // than 4, so we don't have to worry about 'movzx' anymore. Minimum
887 : // size is enough to determine if we want 32-bit or 64-bit move.
888 : instId = X86Inst::kIdMov;
889 0 : srcSize = std::min(srcSize, dstSize);
890 :
891 0 : dst.setSignature(srcSize == 4 ? X86Reg::signatureOfT<X86Reg::kRegGpd>()
892 : : X86Reg::signatureOfT<X86Reg::kRegGpq>());
893 0 : if (src.isReg()) src.setSignature(dst.getSignature());
894 : }
895 : break;
896 : }
897 :
898 : // NOTE: The previous branch caught all memory sources; from here on it's
899 : // always a register-to-register conversion, so handle the remaining cases.
900 0 : srcSize = std::min(srcSize, dstSize);
901 :
902 0 : if (TypeId::isMmx(srcTypeId)) {
903 : // 64-bit move.
904 : instId = X86Inst::kIdMovq;
905 0 : if (srcSize == 8) break;
906 :
907 : // 32-bit move.
908 : instId = X86Inst::kIdMovd;
909 : dst.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
910 : break;
911 : }
912 :
913 0 : if (TypeId::isMask(srcTypeId)) {
914 0 : instId = X86Inst::kmovIdFromSize(srcSize);
915 0 : dst.setSignature(srcSize <= 4 ? X86Reg::signatureOfT<X86Reg::kRegGpd>()
916 : : X86Reg::signatureOfT<X86Reg::kRegGpq>());
917 : break;
918 : }
919 :
920 0 : if (TypeId::isVec(srcTypeId)) {
921 : // 64-bit move.
922 0 : instId = avxEnabled ? X86Inst::kIdVmovq : X86Inst::kIdMovq;
923 0 : if (srcSize == 8) break;
924 :
925 : // 32-bit move.
926 0 : instId = avxEnabled ? X86Inst::kIdVmovd : X86Inst::kIdMovd;
927 : dst.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
928 : break;
929 : }
930 : }
931 :
932 0 : if (TypeId::isMmx(dstTypeId)) {
933 : instId = X86Inst::kIdMovq;
934 0 : srcSize = std::min(srcSize, dstSize);
935 :
936 0 : if (TypeId::isInt(srcTypeId) || src.isMem()) {
937 : // 64-bit move.
938 0 : if (srcSize == 8) break;
939 :
940 : // 32-bit move.
941 : instId = X86Inst::kIdMovd;
942 0 : if (src.isReg()) src.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
943 : break;
944 : }
945 :
946 0 : if (TypeId::isMmx(srcTypeId)) break;
947 :
948 : // NOTE: This will hurt if `avxEnabled`.
949 : instId = X86Inst::kIdMovdq2q;
950 0 : if (TypeId::isVec(srcTypeId)) break;
951 : }
952 :
953 0 : if (TypeId::isMask(dstTypeId)) {
954 0 : srcSize = std::min(srcSize, dstSize);
955 :
956 0 : if (TypeId::isInt(srcTypeId) || TypeId::isMask(srcTypeId) || src.isMem()) {
957 0 : instId = X86Inst::kmovIdFromSize(srcSize);
958 0 : if (X86Reg::isGp(src) && srcSize <= 4) src.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
959 : break;
960 : }
961 : }
962 :
963 0 : if (TypeId::isVec(dstTypeId)) {
964 : // By default set destination to XMM, will be set to YMM|ZMM if needed.
965 : dst.setSignature(X86Reg::signatureOfT<X86Reg::kRegXmm>());
966 :
967 : // NOTE: This will hurt if `avxEnabled`.
968 0 : if (X86Reg::isMm(src)) {
969 : // 64-bit move.
970 : instId = X86Inst::kIdMovq2dq;
971 : break;
972 : }
973 :
974 : // Argument conversion.
975 : uint32_t dstElement = TypeId::elementOf(dstTypeId);
976 : uint32_t srcElement = TypeId::elementOf(srcTypeId);
977 :
978 0 : if (dstElement == TypeId::kF32 && srcElement == TypeId::kF64) {
979 0 : srcSize = std::min(dstSize * 2, srcSize);
980 0 : dstSize = srcSize / 2;
981 :
982 0 : if (srcSize <= 8)
983 0 : instId = avxEnabled ? X86Inst::kIdVcvtss2sd : X86Inst::kIdCvtss2sd;
984 : else
985 0 : instId = avxEnabled ? X86Inst::kIdVcvtps2pd : X86Inst::kIdCvtps2pd;
986 :
987 0 : if (dstSize == 32)
988 : dst.setSignature(X86Reg::signatureOfT<X86Reg::kRegYmm>());
989 0 : if (src.isReg())
990 : src.setSignature(X86Reg::signatureOfVecBySize(srcSize));
991 : break;
992 : }
993 :
994 0 : if (dstElement == TypeId::kF64 && srcElement == TypeId::kF32) {
995 0 : srcSize = std::min(dstSize, srcSize * 2) / 2;
996 0 : dstSize = srcSize * 2;
997 :
998 0 : if (srcSize <= 4)
999 0 : instId = avxEnabled ? X86Inst::kIdVcvtsd2ss : X86Inst::kIdCvtsd2ss;
1000 : else
1001 0 : instId = avxEnabled ? X86Inst::kIdVcvtpd2ps : X86Inst::kIdCvtpd2ps;
1002 :
1003 : dst.setSignature(X86Reg::signatureOfVecBySize(dstSize));
1004 0 : if (src.isReg() && srcSize >= 32)
1005 : src.setSignature(X86Reg::signatureOfT<X86Reg::kRegYmm>());
1006 : break;
1007 : }
1008 :
1009 0 : srcSize = std::min(srcSize, dstSize);
1010 0 : if (X86Reg::isGp(src) || src.isMem()) {
1011 : // 32-bit move.
1012 0 : if (srcSize <= 4) {
1013 0 : instId = avxEnabled ? X86Inst::kIdVmovd : X86Inst::kIdMovd;
1014 0 : if (src.isReg()) src.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
1015 : break;
1016 : }
1017 :
1018 : // 64-bit move.
1019 0 : if (srcSize == 8) {
1020 0 : instId = avxEnabled ? X86Inst::kIdVmovq : X86Inst::kIdMovq;
1021 : break;
1022 : }
1023 : }
1024 :
1025 0 : if (X86Reg::isVec(src) || src.isMem()) {
1026 0 : instId = avxEnabled ? X86Inst::kIdVmovaps : X86Inst::kIdMovaps;
1027 : uint32_t sign = X86Reg::signatureOfVecBySize(srcSize);
1028 :
1029 : dst.setSignature(sign);
1030 0 : if (src.isReg()) src.setSignature(sign);
1031 : break;
1032 : }
1033 : }
1034 :
1035 : return DebugUtils::errored(kErrorInvalidState);
1036 : }
1037 :
1038 0 : if (src.isMem())
1039 0 : src.as<X86Mem>().setSize(srcSize);
1040 :
1041 : emitter->setInlineComment(comment);
1042 0 : return emitter->emit(instId, dst, src);
1043 : }
1044 :
1045 : // ============================================================================
1046 : // [asmjit::X86Internal - Emit Prolog & Epilog]
1047 : // ============================================================================
1048 :
1049 1944 : ASMJIT_FAVOR_SIZE Error X86Internal::emitProlog(X86Emitter* emitter, const FuncFrameLayout& layout) {
1050 : uint32_t gpSaved = layout.getSavedRegs(X86Reg::kKindGp);
1051 :
1052 : X86Gp zsp = emitter->zsp(); // ESP|RSP register.
1053 : X86Gp zbp = emitter->zbp(); // EBP|RBP register.
1054 : X86Gp gpReg = emitter->zsp(); // General purpose register (temporary).
1055 : X86Gp saReg = emitter->zsp(); // Stack-arguments base register.
1056 :
1057 : // Emit: 'push zbp'
1058 : // 'mov zbp, zsp'.
1059 1944 : if (layout.hasPreservedFP()) {
1060 0 : gpSaved &= ~Utils::mask(X86Gp::kIdBp);
1061 0 : ASMJIT_PROPAGATE(emitter->push(zbp));
1062 0 : ASMJIT_PROPAGATE(emitter->mov(zbp, zsp));
1063 : }
1064 :
1065 : // Emit: 'push gp' sequence.
1066 1944 : if (gpSaved) {
1067 5408 : for (uint32_t i = gpSaved, regId = 0; i; i >>= 1, regId++) {
1068 4344 : if (!(i & 0x1)) continue;
1069 : gpReg.setId(regId);
1070 1108 : ASMJIT_PROPAGATE(emitter->push(gpReg));
1071 : }
1072 : }
1073 :
1074 : // Emit: 'mov saReg, zsp'.
1075 : uint32_t stackArgsRegId = layout.getStackArgsRegId();
1076 1944 : if (stackArgsRegId != Globals::kInvalidRegId && stackArgsRegId != X86Gp::kIdSp) {
1077 : saReg.setId(stackArgsRegId);
1078 0 : if (!(layout.hasPreservedFP() && stackArgsRegId == X86Gp::kIdBp))
1079 0 : ASMJIT_PROPAGATE(emitter->mov(saReg, zsp));
1080 : }
1081 :
1082 : // Emit: 'and zsp, StackAlignment'.
1083 1944 : if (layout.hasDynamicAlignment())
1084 0 : ASMJIT_PROPAGATE(emitter->and_(zsp, -static_cast<int32_t>(layout.getStackAlignment())));
1085 :
1086 : // Emit: 'sub zsp, StackAdjustment'.
1087 1944 : if (layout.hasStackAdjustment())
1088 932 : ASMJIT_PROPAGATE(emitter->sub(zsp, layout.getStackAdjustment()));
1089 :
1090 : // Emit: 'mov [zsp + dsaSlot], saReg'.
1091 1944 : if (layout.hasDynamicAlignment() && layout.hasDsaSlotUsed()) {
1092 0 : X86Mem saMem = x86::ptr(zsp, layout._dsaSlot);
1093 0 : ASMJIT_PROPAGATE(emitter->mov(saMem, saReg));
1094 : }
1095 :
1096 : // Emit 'movaps|movups [zsp + X], xmm0..15'.
1097 : uint32_t xmmSaved = layout.getSavedRegs(X86Reg::kKindVec);
1098 1944 : if (xmmSaved) {
1099 : X86Mem vecBase = x86::ptr(zsp, layout.getVecStackOffset());
1100 : X86Reg vecReg = x86::xmm(0);
1101 :
1102 : uint32_t vecInst = x86GetXmmMovInst(layout);
1103 : uint32_t vecSize = 16;
1104 :
1105 0 : for (uint32_t i = xmmSaved, regId = 0; i; i >>= 1, regId++) {
1106 0 : if (!(i & 0x1)) continue;
1107 : vecReg.setId(regId);
1108 0 : ASMJIT_PROPAGATE(emitter->emit(vecInst, vecBase, vecReg));
1109 : vecBase.addOffsetLo32(static_cast<int32_t>(vecSize));
1110 : }
1111 : }
1112 :
1113 : return kErrorOk;
1114 : }
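// Editorial sketch (not part of the original sources): for an X64 function
// that dirties rbx and r12, preserves no frame pointer and needs a 40-byte
// stack adjustment, the code above emits roughly
//
//   push rbx
//   push r12
//   sub  rsp, 40
//
// with 'push rbp; mov rbp, rsp' prepended when the frame pointer is
// preserved, 'and rsp, -Alignment' when dynamic alignment is required, and
// 'movaps/movups [rsp + X], xmmN' stores appended when vector registers
// have to be saved. Exact values depend on the computed FuncFrameLayout.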
1115 :
1116 1944 : ASMJIT_FAVOR_SIZE Error X86Internal::emitEpilog(X86Emitter* emitter, const FuncFrameLayout& layout) {
1117 : uint32_t i;
1118 : uint32_t regId;
1119 :
1120 : uint32_t gpSize = emitter->getGpSize();
1121 : uint32_t gpSaved = layout.getSavedRegs(X86Reg::kKindGp);
1122 :
1123 : X86Gp zsp = emitter->zsp(); // ESP|RSP register.
1124 : X86Gp zbp = emitter->zbp(); // EBP|RBP register.
1125 : X86Gp gpReg = emitter->zsp(); // General purpose register (temporary).
1126 :
1127 : // Don't emit 'pop zbp' in the pop sequence; this case is handled separately.
1128 1944 : if (layout.hasPreservedFP()) gpSaved &= ~Utils::mask(X86Gp::kIdBp);
1129 :
1130 : // Emit 'movaps|movups xmm0..15, [zsp + X]'.
1131 : uint32_t xmmSaved = layout.getSavedRegs(X86Reg::kKindVec);
1132 1944 : if (xmmSaved) {
1133 : X86Mem vecBase = x86::ptr(zsp, layout.getVecStackOffset());
1134 : X86Reg vecReg = x86::xmm(0);
1135 :
1136 : uint32_t vecInst = x86GetXmmMovInst(layout);
1137 : uint32_t vecSize = 16;
1138 :
1139 0 : for (i = xmmSaved, regId = 0; i; i >>= 1, regId++) {
1140 0 : if (!(i & 0x1)) continue;
1141 : vecReg.setId(regId);
1142 0 : ASMJIT_PROPAGATE(emitter->emit(vecInst, vecReg, vecBase));
1143 : vecBase.addOffsetLo32(static_cast<int32_t>(vecSize));
1144 : }
1145 : }
1146 :
1147 : // Emit 'emms' and 'vzeroupper'.
1148 1944 : if (layout.hasMmxCleanup()) ASMJIT_PROPAGATE(emitter->emms());
1149 1944 : if (layout.hasAvxCleanup()) ASMJIT_PROPAGATE(emitter->vzeroupper());
1150 :
1151 1944 : if (layout.hasPreservedFP()) {
1152 : // Emit 'mov zsp, zbp' or 'lea zsp, [zbp - x]'
1153 0 : int32_t count = static_cast<int32_t>(layout.getGpStackSize() - gpSize);
1154 0 : if (!count)
1155 0 : ASMJIT_PROPAGATE(emitter->mov(zsp, zbp));
1156 : else
1157 0 : ASMJIT_PROPAGATE(emitter->lea(zsp, x86::ptr(zbp, -count)));
1158 : }
1159 : else {
1160 1944 : if (layout.hasDynamicAlignment() && layout.hasDsaSlotUsed()) {
1161 : // Emit 'mov zsp, [zsp + DsaSlot]'.
1162 0 : X86Mem saMem = x86::ptr(zsp, layout._dsaSlot);
1163 0 : ASMJIT_PROPAGATE(emitter->mov(zsp, saMem));
1164 : }
1165 1944 : else if (layout.hasStackAdjustment()) {
1166 : // Emit 'add zsp, StackAdjustment'.
1167 932 : ASMJIT_PROPAGATE(emitter->add(zsp, static_cast<int32_t>(layout.getStackAdjustment())));
1168 : }
1169 : }
1170 :
1171 : // Emit 'pop gp' sequence.
1172 1944 : if (gpSaved) {
1173 : i = gpSaved;
1174 : regId = 16;
1175 :
1176 : do {
1177 17024 : regId--;
1178 17024 : if (i & 0x8000) {
1179 : gpReg.setId(regId);
1180 1108 : ASMJIT_PROPAGATE(emitter->pop(gpReg));
1181 : }
1182 17024 : i <<= 1;
1183 17024 : } while (regId != 0);
1184 : }
1185 :
1186 : // Emit 'pop zbp'.
1187 1944 : if (layout.hasPreservedFP()) ASMJIT_PROPAGATE(emitter->pop(zbp));
1188 :
1189 : // Emit 'ret' or 'ret x'.
1190 1944 : if (layout.hasCalleeStackCleanup())
1191 0 : ASMJIT_PROPAGATE(emitter->emit(X86Inst::kIdRet, static_cast<int>(layout.getCalleeStackCleanup())));
1192 : else
1193 1944 : ASMJIT_PROPAGATE(emitter->emit(X86Inst::kIdRet));
1194 :
1195 : return kErrorOk;
1196 : }
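// Editorial sketch (not part of the original sources): the epilog matching
// the prolog example above would be roughly
//
//   add rsp, 40
//   pop r12
//   pop rbx
//   ret
//
// with 'emms'/'vzeroupper' emitted after the vector restores when MMX/AVX
// cleanup is requested, and 'ret imm16' instead of 'ret' when the callee
// pops its own stack arguments (e.g. stdcall).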
1197 :
1198 : // ============================================================================
1199 : // [asmjit::X86Internal - AllocArgs]
1200 : // ============================================================================
1201 :
1202 0 : ASMJIT_FAVOR_SIZE Error X86Internal::allocArgs(X86Emitter* emitter, const FuncFrameLayout& layout, const FuncArgsMapper& args) {
1203 : typedef X86FuncArgsContext::SrcArg SrcArg;
1204 : typedef X86FuncArgsContext::DstArg DstArg;
1205 : typedef X86FuncArgsContext::WorkData WorkData;
1206 : enum { kMaxVRegKinds = Globals::kMaxVRegKinds };
1207 :
1208 : uint32_t i;
1209 : const FuncDetail& func = *args.getFuncDetail();
1210 :
1211 0 : X86FuncArgsContext ctx;
1212 0 : ASMJIT_PROPAGATE(ctx.initWorkData(args, layout._savedRegs, layout.hasPreservedFP()));
1213 :
1214 : // We must honor AVX if it's enabled.
1215 : bool avxEnabled = layout.isAvxEnabled();
1216 :
1217 : // Free registers that can be used as temporaries and during shuffling.
1218 : // We initialize them to match all workRegs (registers that can be used
1219 : // by the function) except source regs, which are used to pass arguments.
1220 : // Free registers are changed during shuffling - when an argument is moved
1221 : // to its final register, that register is removed from freeRegs
1222 : // (it can't be altered anymore during shuffling).
1223 : uint32_t freeRegs[kMaxVRegKinds];
1224 0 : for (i = 0; i < kMaxVRegKinds; i++)
1225 0 : freeRegs[i] = ctx._workData[i].workRegs & ~ctx._workData[i].srcRegs;
1226 :
1227 : // This is an iterative process that runs as long as there is work to do. When
1228 : // one register is moved it can create space for another move. Such moves can
1229 : // depend on each other, so the algorithm may run multiple times before all
1230 : // arguments are in place. This part does only register-to-register work;
1231 : // arguments moved from the stack to a register are handled later.
1232 : for (;;) {
1233 : bool hasWork = false; // Do we have work to do?
1234 : bool didWork = false; // If we did something...
1235 :
1236 : uint32_t dstRegKind = kMaxVRegKinds;
1237 : do {
1238 0 : WorkData& wd = ctx._workData[--dstRegKind];
1239 0 : if (wd.numOps > wd.numStackArgs) {
1240 : hasWork = true;
1241 :
1242 : // Iterate over all destination regs and check if we can do something.
1243 : // We always go from destination to source, never the opposite.
1244 0 : uint32_t regsToDo = wd.dstRegs;
1245 : do {
1246 : // If there is work to do there has to be at least one dstReg.
1247 : ASMJIT_ASSERT(regsToDo != 0);
1248 : uint32_t dstRegId = Utils::findFirstBit(regsToDo);
1249 : uint32_t dstRegMask = Utils::mask(dstRegId);
1250 :
1251 0 : uint32_t argIndex = wd.argIndex[dstRegId];
1252 0 : const DstArg& dstArg = args.getArg(argIndex);
1253 : const SrcArg& srcArg = func.getArg(argIndex);
1254 :
1255 0 : if (srcArg.byReg()) {
1256 0 : uint32_t srcRegType = srcArg.getRegType();
1257 : uint32_t srcRegKind = X86Reg::kindOf(srcRegType);
1258 :
1259 0 : if (freeRegs[dstRegKind] & dstRegMask) {
1260 : X86Reg dstReg(X86Reg::fromTypeAndId(dstArg.getRegType(), dstRegId));
1261 : X86Reg srcReg(X86Reg::fromTypeAndId(srcRegType, srcArg.getRegId()));
1262 :
1263 0 : ASMJIT_PROPAGATE(
1264 : emitArgMove(emitter,
1265 : dstReg, dstArg.getTypeId(),
1266 : srcReg, srcArg.getTypeId(), avxEnabled));
1267 0 : freeRegs[dstRegKind] ^= dstRegMask; // Make the DST reg occupied.
1268 0 : freeRegs[srcRegKind] |= Utils::mask(srcArg.getRegId()); // Make the SRC reg free.
1269 :
1270 : ASMJIT_ASSERT(wd.numOps >= 1);
1271 0 : wd.numOps--;
1272 : didWork = true;
1273 : }
1274 : else {
1275 : // Check if this is a swap operation.
1276 0 : if (dstRegKind == srcRegKind) {
1277 : uint32_t srcRegId = srcArg.getRegId();
1278 :
1279 0 : uint32_t otherIndex = wd.argIndex[srcRegId];
1280 0 : const DstArg& otherArg = args.getArg(otherIndex);
1281 :
1282 0 : if (otherArg.getRegId() == srcRegId && X86Reg::kindOf(otherArg.getRegType()) == dstRegKind) {
1283 : // If this is GP reg it can be handled by 'xchg'.
1284 0 : if (dstRegKind == X86Reg::kKindGp) {
1285 0 : uint32_t highestType = std::max(dstArg.getRegType(), srcRegType);
1286 :
1287 0 : X86Reg dstReg = x86::gpd(dstRegId);
1288 0 : X86Reg srcReg = x86::gpd(srcRegId);
1289 :
1290 0 : if (highestType == X86Reg::kRegGpq) {
1291 : dstReg.setSignature(X86RegTraits<X86Reg::kRegGpq>::kSignature);
1292 : srcReg.setSignature(X86RegTraits<X86Reg::kRegGpq>::kSignature);
1293 : }
1294 0 : ASMJIT_PROPAGATE(emitter->emit(X86Inst::kIdXchg, dstReg, srcReg));
1295 0 : regsToDo &= ~Utils::mask(srcRegId);
1296 0 : freeRegs[dstRegKind] &= ~(Utils::mask(srcRegId) | dstRegMask);
1297 :
1298 : ASMJIT_ASSERT(wd.numOps >= 2);
1299 : ASMJIT_ASSERT(wd.numSwaps >= 1);
1300 0 : wd.numOps-=2;
1301 0 : wd.numSwaps--;
1302 : didWork = true;
1303 : }
1304 : }
1305 : }
1306 : }
1307 : }
1308 :
1309 : // Clear the reg in `regsToDo` and continue if there are more.
1310 0 : regsToDo ^= dstRegMask;
1311 0 : } while (regsToDo);
1312 : }
1313 0 : } while (dstRegKind);
1314 :
1315 0 : if (!hasWork)
1316 : break;
1317 :
1318 0 : if (!didWork)
1319 : return DebugUtils::errored(kErrorInvalidState);
1320 : }
1321 :
1322 : // Load arguments passed by stack into registers. This is pretty simple and,
1323 : // unlike the previous phase, never requires multiple iterations.
1324 0 : if (ctx._hasStackArgs) {
1325 : // Base address of all arguments passed by stack.
1326 : X86Mem saBase = x86::ptr(emitter->gpz(layout.getStackArgsRegId()), layout.getStackArgsOffset());
1327 :
1328 : uint32_t dstRegKind = kMaxVRegKinds;
1329 : do {
1330 0 : WorkData& wd = ctx._workData[--dstRegKind];
1331 0 : if (wd.numStackArgs) {
1332 : // Iterate over all destination regs and check if we can do something.
1333 : // We always go from destination to source, never the opposite.
1334 0 : uint32_t regsToDo = wd.dstRegs;
1335 : do {
1336 : // If there is work to do there has to be at least one dstReg.
1337 : ASMJIT_ASSERT(regsToDo != 0);
1338 : ASMJIT_ASSERT(wd.numOps > 0);
1339 :
1340 : uint32_t dstRegId = Utils::findFirstBit(regsToDo);
1341 : uint32_t dstRegMask = Utils::mask(dstRegId);
1342 :
1343 0 : uint32_t argIndex = wd.argIndex[dstRegId];
1344 0 : const DstArg& dstArg = args.getArg(argIndex);
1345 : const SrcArg& srcArg = func.getArg(argIndex);
1346 :
1347 : // Only arguments passed by stack should remain, and the destination
1348 : // registers must be free now (otherwise the first part of the algorithm
1349 : // failed). Ideally this would be an assert, but it's much safer to enforce
1350 : // it in release builds as well.
1351 0 : if (!srcArg.byStack() || !(freeRegs[dstRegKind] & dstRegMask))
1352 0 : return DebugUtils::errored(kErrorInvalidState);
1353 :
1354 : X86Reg dstReg = X86Reg::fromTypeAndId(dstArg.getRegType(), dstRegId);
1355 : X86Mem srcMem = saBase.adjusted(srcArg.getStackOffset());
1356 :
1357 0 : ASMJIT_PROPAGATE(
1358 : emitArgMove(emitter,
1359 : dstReg, dstArg.getTypeId(),
1360 : srcMem, srcArg.getTypeId(), avxEnabled));
1361 :
1362 0 : freeRegs[dstRegKind] ^= dstRegMask;
1363 0 : regsToDo ^= dstRegMask;
1364 0 : wd.numOps--;
1365 0 : } while (regsToDo);
1366 : }
1367 0 : } while (dstRegKind);
1368 : }
1369 :
1370 : return kErrorOk;
1371 : }
1372 :
1373 : } // asmjit namespace
1374 : } // namespace PLMD
1375 :
1376 : // [Api-End]
1377 : #include "./asmjit_apiend.h"
1378 :
1379 : // [Guard]
1380 : #endif // ASMJIT_BUILD_X86
1381 : #pragma GCC diagnostic pop
1382 : #endif // __PLUMED_HAS_ASMJIT
|