
/*---------------------------------------------------------------*/
/*--- begin                                 host_amd64_defs.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2013 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex.h"
#include "libvex_trc_values.h"

#include "main_util.h"
#include "host_generic_regs.h"
#include "host_amd64_defs.h"


/* --------- Registers. --------- */

const RRegUniverse* getRRegUniverse_AMD64 ( void )
{
   /* The real-register universe is a big constant, so we just want to
      initialise it once. */
   static RRegUniverse rRegUniverse_AMD64;
   static Bool         rRegUniverse_AMD64_initted = False;

   /* Handy shorthand, nothing more */
   RRegUniverse* ru = &rRegUniverse_AMD64;

   /* This isn't thread-safe.  Sigh. */
   if (LIKELY(rRegUniverse_AMD64_initted))
      return ru;

   RRegUniverse__init(ru);

   /* Add the registers.  The initial segment of this array must be
      those available for allocation by reg-alloc, and those that
      follow are not available for allocation. */
   ru->regs[ru->size++] = hregAMD64_RSI();
   ru->regs[ru->size++] = hregAMD64_RDI();
   ru->regs[ru->size++] = hregAMD64_R8();
   ru->regs[ru->size++] = hregAMD64_R9();
   ru->regs[ru->size++] = hregAMD64_R12();
   ru->regs[ru->size++] = hregAMD64_R13();
   ru->regs[ru->size++] = hregAMD64_R14();
   ru->regs[ru->size++] = hregAMD64_R15();
   ru->regs[ru->size++] = hregAMD64_RBX();
   ru->regs[ru->size++] = hregAMD64_XMM3();
   ru->regs[ru->size++] = hregAMD64_XMM4();
   ru->regs[ru->size++] = hregAMD64_XMM5();
   ru->regs[ru->size++] = hregAMD64_XMM6();
   ru->regs[ru->size++] = hregAMD64_XMM7();
   ru->regs[ru->size++] = hregAMD64_XMM8();
   ru->regs[ru->size++] = hregAMD64_XMM9();
   ru->regs[ru->size++] = hregAMD64_XMM10();
   ru->regs[ru->size++] = hregAMD64_XMM11();
   ru->regs[ru->size++] = hregAMD64_XMM12();
   ru->regs[ru->size++] = hregAMD64_R10();
   ru->allocable = ru->size;
   /* And other regs, not available to the allocator. */
   ru->regs[ru->size++] = hregAMD64_RAX();
   ru->regs[ru->size++] = hregAMD64_RCX();
   ru->regs[ru->size++] = hregAMD64_RDX();
   ru->regs[ru->size++] = hregAMD64_RSP();
   ru->regs[ru->size++] = hregAMD64_RBP();
   ru->regs[ru->size++] = hregAMD64_R11();
   ru->regs[ru->size++] = hregAMD64_XMM0();
   ru->regs[ru->size++] = hregAMD64_XMM1();

   rRegUniverse_AMD64_initted = True;

   RRegUniverse__check_is_sane(ru);
   return ru;
}


void ppHRegAMD64 ( HReg reg )
{
   Int r;
   static const HChar* ireg64_names[16]
     = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
         "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
   /* Be generic for all virtual regs. */
   if (hregIsVirtual(reg)) {
      ppHReg(reg);
      return;
   }
   /* But specific for real regs. */
   switch (hregClass(reg)) {
      case HRcInt64:
         r = hregEncoding(reg);
         vassert(r >= 0 && r < 16);
         vex_printf("%s", ireg64_names[r]);
         return;
      case HRcVec128:
         r = hregEncoding(reg);
         vassert(r >= 0 && r < 16);
         vex_printf("%%xmm%d", r);
         return;
      default:
         vpanic("ppHRegAMD64");
   }
}

static void ppHRegAMD64_lo32 ( HReg reg )
{
   Int r;
   static const HChar* ireg32_names[16]
     = { "%eax", "%ecx", "%edx",  "%ebx",  "%esp",  "%ebp",  "%esi",  "%edi",
         "%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" };
   /* Be generic for all virtual regs. */
   if (hregIsVirtual(reg)) {
      ppHReg(reg);
      vex_printf("d");
      return;
   }
   /* But specific for real regs. */
   switch (hregClass(reg)) {
      case HRcInt64:
         r = hregEncoding(reg);
         vassert(r >= 0 && r < 16);
         vex_printf("%s", ireg32_names[r]);
         return;
      default:
         vpanic("ppHRegAMD64_lo32: invalid regclass");
   }
}
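
/* So, for example, the real register %rax prints as "%rax" via
   ppHRegAMD64 and as "%eax" via ppHRegAMD64_lo32, whereas a virtual
   register falls back to the generic ppHReg text in both cases, with
   a "d" suffix appended in the lo32 case. */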


/* --------- Condition codes, Intel encoding. --------- */

const HChar* showAMD64CondCode ( AMD64CondCode cond )
{
   switch (cond) {
      case Acc_O:      return "o";
      case Acc_NO:     return "no";
      case Acc_B:      return "b";
      case Acc_NB:     return "nb";
      case Acc_Z:      return "z";
      case Acc_NZ:     return "nz";
      case Acc_BE:     return "be";
      case Acc_NBE:    return "nbe";
      case Acc_S:      return "s";
      case Acc_NS:     return "ns";
      case Acc_P:      return "p";
      case Acc_NP:     return "np";
      case Acc_L:      return "l";
      case Acc_NL:     return "nl";
      case Acc_LE:     return "le";
      case Acc_NLE:    return "nle";
      case Acc_ALWAYS: return "ALWAYS";
      default: vpanic("ppAMD64CondCode");
   }
}


/* --------- AMD64AMode: memory address expressions. --------- */

AMD64AMode* AMD64AMode_IR ( UInt imm32, HReg reg ) {
   AMD64AMode* am = LibVEX_Alloc_inline(sizeof(AMD64AMode));
   am->tag        = Aam_IR;
   am->Aam.IR.imm = imm32;
   am->Aam.IR.reg = reg;
   return am;
}
AMD64AMode* AMD64AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
   AMD64AMode* am = LibVEX_Alloc_inline(sizeof(AMD64AMode));
   am->tag = Aam_IRRS;
   am->Aam.IRRS.imm   = imm32;
   am->Aam.IRRS.base  = base;
   am->Aam.IRRS.index = indEx;
   am->Aam.IRRS.shift = shift;
   vassert(shift >= 0 && shift <= 3);
   return am;
}

void ppAMD64AMode ( AMD64AMode* am ) {
   switch (am->tag) {
      case Aam_IR:
         if (am->Aam.IR.imm == 0)
            vex_printf("(");
         else
            vex_printf("0x%x(", am->Aam.IR.imm);
         ppHRegAMD64(am->Aam.IR.reg);
         vex_printf(")");
         return;
      case Aam_IRRS:
         vex_printf("0x%x(", am->Aam.IRRS.imm);
         ppHRegAMD64(am->Aam.IRRS.base);
         vex_printf(",");
         ppHRegAMD64(am->Aam.IRRS.index);
         vex_printf(",%d)", 1 << am->Aam.IRRS.shift);
         return;
      default:
         vpanic("ppAMD64AMode");
   }
}
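
/* For example, an amode built as
      AMD64AMode_IRRS(0x18, hregAMD64_RBP(), hregAMD64_RCX(), 3)
   denotes the address %rbp + 0x18 + (%rcx << 3), and ppAMD64AMode
   prints it in AT&T syntax as "0x18(%rbp,%rcx,8)". */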

static void addRegUsage_AMD64AMode ( HRegUsage* u, AMD64AMode* am ) {
   switch (am->tag) {
      case Aam_IR:
         addHRegUse(u, HRmRead, am->Aam.IR.reg);
         return;
      case Aam_IRRS:
         addHRegUse(u, HRmRead, am->Aam.IRRS.base);
         addHRegUse(u, HRmRead, am->Aam.IRRS.index);
         return;
      default:
         vpanic("addRegUsage_AMD64AMode");
   }
}

static void mapRegs_AMD64AMode ( HRegRemap* m, AMD64AMode* am ) {
   switch (am->tag) {
      case Aam_IR:
         am->Aam.IR.reg = lookupHRegRemap(m, am->Aam.IR.reg);
         return;
      case Aam_IRRS:
         am->Aam.IRRS.base = lookupHRegRemap(m, am->Aam.IRRS.base);
         am->Aam.IRRS.index = lookupHRegRemap(m, am->Aam.IRRS.index);
         return;
      default:
         vpanic("mapRegs_AMD64AMode");
   }
}

/* --------- Operand, which can be reg, immediate or memory. --------- */

AMD64RMI* AMD64RMI_Imm ( UInt imm32 ) {
   AMD64RMI* op       = LibVEX_Alloc_inline(sizeof(AMD64RMI));
   op->tag            = Armi_Imm;
   op->Armi.Imm.imm32 = imm32;
   return op;
}
AMD64RMI* AMD64RMI_Reg ( HReg reg ) {
   AMD64RMI* op     = LibVEX_Alloc_inline(sizeof(AMD64RMI));
   op->tag          = Armi_Reg;
   op->Armi.Reg.reg = reg;
   return op;
}
AMD64RMI* AMD64RMI_Mem ( AMD64AMode* am ) {
   AMD64RMI* op    = LibVEX_Alloc_inline(sizeof(AMD64RMI));
   op->tag         = Armi_Mem;
   op->Armi.Mem.am = am;
   return op;
}

static void ppAMD64RMI_wrk ( AMD64RMI* op, Bool lo32 ) {
   switch (op->tag) {
      case Armi_Imm:
         vex_printf("$0x%x", op->Armi.Imm.imm32);
         return;
      case Armi_Reg:
         if (lo32)
            ppHRegAMD64_lo32(op->Armi.Reg.reg);
         else
            ppHRegAMD64(op->Armi.Reg.reg);
         return;
      case Armi_Mem:
         ppAMD64AMode(op->Armi.Mem.am);
         return;
     default:
         vpanic("ppAMD64RMI");
   }
}
void ppAMD64RMI ( AMD64RMI* op ) {
   ppAMD64RMI_wrk(op, False/*!lo32*/);
}
void ppAMD64RMI_lo32 ( AMD64RMI* op ) {
   ppAMD64RMI_wrk(op, True/*lo32*/);
}

/* An AMD64RMI can only be used in a "read" context (what would it mean
   to write or modify a literal?) and so we enumerate its registers
   accordingly. */
static void addRegUsage_AMD64RMI ( HRegUsage* u, AMD64RMI* op ) {
   switch (op->tag) {
      case Armi_Imm:
         return;
      case Armi_Reg:
         addHRegUse(u, HRmRead, op->Armi.Reg.reg);
         return;
      case Armi_Mem:
         addRegUsage_AMD64AMode(u, op->Armi.Mem.am);
         return;
      default:
         vpanic("addRegUsage_AMD64RMI");
   }
}
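
/* For example, given Alu64R(Aalu_ADD, AMD64RMI_Reg(rSrc), rDst) --
   where rSrc and rDst stand for any two registers -- this routine
   marks rSrc as HRmRead; whether rDst is read, written or modified is
   decided separately, per opcode, in getRegUsage_AMD64Instr below. */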

static void mapRegs_AMD64RMI ( HRegRemap* m, AMD64RMI* op ) {
   switch (op->tag) {
      case Armi_Imm:
         return;
      case Armi_Reg:
         op->Armi.Reg.reg = lookupHRegRemap(m, op->Armi.Reg.reg);
         return;
      case Armi_Mem:
         mapRegs_AMD64AMode(m, op->Armi.Mem.am);
         return;
      default:
         vpanic("mapRegs_AMD64RMI");
   }
}


/* --------- Operand, which can be reg or immediate only. --------- */

AMD64RI* AMD64RI_Imm ( UInt imm32 ) {
   AMD64RI* op       = LibVEX_Alloc_inline(sizeof(AMD64RI));
   op->tag           = Ari_Imm;
   op->Ari.Imm.imm32 = imm32;
   return op;
}
AMD64RI* AMD64RI_Reg ( HReg reg ) {
   AMD64RI* op     = LibVEX_Alloc_inline(sizeof(AMD64RI));
   op->tag         = Ari_Reg;
   op->Ari.Reg.reg = reg;
   return op;
}

void ppAMD64RI ( AMD64RI* op ) {
   switch (op->tag) {
      case Ari_Imm:
         vex_printf("$0x%x", op->Ari.Imm.imm32);
         return;
      case Ari_Reg:
         ppHRegAMD64(op->Ari.Reg.reg);
         return;
     default:
         vpanic("ppAMD64RI");
   }
}

/* An AMD64RI can only be used in a "read" context (what would it mean
   to write or modify a literal?) and so we enumerate its registers
   accordingly. */
static void addRegUsage_AMD64RI ( HRegUsage* u, AMD64RI* op ) {
   switch (op->tag) {
      case Ari_Imm:
         return;
      case Ari_Reg:
         addHRegUse(u, HRmRead, op->Ari.Reg.reg);
         return;
      default:
         vpanic("addRegUsage_AMD64RI");
   }
}

static void mapRegs_AMD64RI ( HRegRemap* m, AMD64RI* op ) {
   switch (op->tag) {
      case Ari_Imm:
         return;
      case Ari_Reg:
         op->Ari.Reg.reg = lookupHRegRemap(m, op->Ari.Reg.reg);
         return;
      default:
         vpanic("mapRegs_AMD64RI");
   }
}


/* --------- Operand, which can be reg or memory only. --------- */

AMD64RM* AMD64RM_Reg ( HReg reg ) {
   AMD64RM* op       = LibVEX_Alloc_inline(sizeof(AMD64RM));
   op->tag         = Arm_Reg;
   op->Arm.Reg.reg = reg;
   return op;
}
AMD64RM* AMD64RM_Mem ( AMD64AMode* am ) {
   AMD64RM* op    = LibVEX_Alloc_inline(sizeof(AMD64RM));
   op->tag        = Arm_Mem;
   op->Arm.Mem.am = am;
   return op;
}

void ppAMD64RM ( AMD64RM* op ) {
   switch (op->tag) {
      case Arm_Mem:
         ppAMD64AMode(op->Arm.Mem.am);
         return;
      case Arm_Reg:
         ppHRegAMD64(op->Arm.Reg.reg);
         return;
     default:
         vpanic("ppAMD64RM");
   }
}

/* Because an AMD64RM can be both a source or destination operand, we
   have to supply a mode -- pertaining to the operand as a whole --
   indicating how it's being used. */
static void addRegUsage_AMD64RM ( HRegUsage* u, AMD64RM* op, HRegMode mode ) {
   switch (op->tag) {
      case Arm_Mem:
         /* Memory is read, written or modified.  So we just want to
            know the regs read by the amode. */
         addRegUsage_AMD64AMode(u, op->Arm.Mem.am);
         return;
      case Arm_Reg:
         /* reg is read, written or modified.  Add it in the
            appropriate way. */
         addHRegUse(u, mode, op->Arm.Reg.reg);
         return;
     default:
         vpanic("addRegUsage_AMD64RM");
   }
}
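
/* Note that even when the operand as a whole is written or modified,
   the registers inside an Arm_Mem amode only ever form the address
   and so are merely read; only the Arm_Reg case picks up 'mode'
   directly.  The callers in this file (Ain_MulL and Ain_Div) pass
   HRmRead in any case. */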

static void mapRegs_AMD64RM ( HRegRemap* m, AMD64RM* op )
{
   switch (op->tag) {
      case Arm_Mem:
         mapRegs_AMD64AMode(m, op->Arm.Mem.am);
         return;
      case Arm_Reg:
         op->Arm.Reg.reg = lookupHRegRemap(m, op->Arm.Reg.reg);
         return;
     default:
         vpanic("mapRegs_AMD64RM");
   }
}


/* --------- Instructions. --------- */

static const HChar* showAMD64ScalarSz ( Int sz ) {
   switch (sz) {
      case 2: return "w";
      case 4: return "l";
      case 8: return "q";
      default: vpanic("showAMD64ScalarSz");
   }
}

const HChar* showAMD64UnaryOp ( AMD64UnaryOp op ) {
   switch (op) {
      case Aun_NOT: return "not";
      case Aun_NEG: return "neg";
      default: vpanic("showAMD64UnaryOp");
   }
}

const HChar* showAMD64AluOp ( AMD64AluOp op ) {
   switch (op) {
      case Aalu_MOV:  return "mov";
      case Aalu_CMP:  return "cmp";
      case Aalu_ADD:  return "add";
      case Aalu_SUB:  return "sub";
      case Aalu_ADC:  return "adc";
      case Aalu_SBB:  return "sbb";
      case Aalu_AND:  return "and";
      case Aalu_OR:   return "or";
      case Aalu_XOR:  return "xor";
      case Aalu_MUL:  return "imul";
      default: vpanic("showAMD64AluOp");
   }
}

const HChar* showAMD64ShiftOp ( AMD64ShiftOp op ) {
   switch (op) {
      case Ash_SHL: return "shl";
      case Ash_SHR: return "shr";
      case Ash_SAR: return "sar";
      default: vpanic("showAMD64ShiftOp");
   }
}

const HChar* showA87FpOp ( A87FpOp op ) {
   switch (op) {
      case Afp_SCALE:  return "scale";
      case Afp_ATAN:   return "atan";
      case Afp_YL2X:   return "yl2x";
      case Afp_YL2XP1: return "yl2xp1";
      case Afp_PREM:   return "prem";
      case Afp_PREM1:  return "prem1";
      case Afp_SQRT:   return "sqrt";
      case Afp_SIN:    return "sin";
      case Afp_COS:    return "cos";
      case Afp_TAN:    return "tan";
      case Afp_ROUND:  return "round";
      case Afp_2XM1:   return "2xm1";
      default: vpanic("showA87FpOp");
   }
}

const HChar* showAMD64SseOp ( AMD64SseOp op ) {
   switch (op) {
      case Asse_MOV:      return "movups";
      case Asse_ADDF:     return "add";
      case Asse_SUBF:     return "sub";
      case Asse_MULF:     return "mul";
      case Asse_DIVF:     return "div";
      case Asse_MAXF:     return "max";
      case Asse_MINF:     return "min";
      case Asse_CMPEQF:   return "cmpFeq";
      case Asse_CMPLTF:   return "cmpFlt";
      case Asse_CMPLEF:   return "cmpFle";
      case Asse_CMPUNF:   return "cmpFun";
      case Asse_RCPF:     return "rcp";
      case Asse_RSQRTF:   return "rsqrt";
      case Asse_SQRTF:    return "sqrt";
      case Asse_AND:      return "and";
      case Asse_OR:       return "or";
      case Asse_XOR:      return "xor";
      case Asse_ANDN:     return "andn";
      case Asse_ADD8:     return "paddb";
      case Asse_ADD16:    return "paddw";
      case Asse_ADD32:    return "paddd";
      case Asse_ADD64:    return "paddq";
      case Asse_QADD8U:   return "paddusb";
      case Asse_QADD16U:  return "paddusw";
      case Asse_QADD8S:   return "paddsb";
      case Asse_QADD16S:  return "paddsw";
      case Asse_SUB8:     return "psubb";
      case Asse_SUB16:    return "psubw";
      case Asse_SUB32:    return "psubd";
      case Asse_SUB64:    return "psubq";
      case Asse_QSUB8U:   return "psubusb";
      case Asse_QSUB16U:  return "psubusw";
      case Asse_QSUB8S:   return "psubsb";
      case Asse_QSUB16S:  return "psubsw";
      case Asse_MUL16:    return "pmullw";
      case Asse_MULHI16U: return "pmulhuw";
      case Asse_MULHI16S: return "pmulhw";
      case Asse_AVG8U:    return "pavgb";
      case Asse_AVG16U:   return "pavgw";
      case Asse_MAX16S:   return "pmaxw";
      case Asse_MAX8U:    return "pmaxub";
      case Asse_MIN16S:   return "pminw";
      case Asse_MIN8U:    return "pminub";
      case Asse_CMPEQ8:   return "pcmpeqb";
      case Asse_CMPEQ16:  return "pcmpeqw";
      case Asse_CMPEQ32:  return "pcmpeqd";
      case Asse_CMPGT8S:  return "pcmpgtb";
      case Asse_CMPGT16S: return "pcmpgtw";
      case Asse_CMPGT32S: return "pcmpgtd";
      case Asse_SHL16:    return "psllw";
      case Asse_SHL32:    return "pslld";
      case Asse_SHL64:    return "psllq";
      case Asse_SHR16:    return "psrlw";
      case Asse_SHR32:    return "psrld";
      case Asse_SHR64:    return "psrlq";
      case Asse_SAR16:    return "psraw";
      case Asse_SAR32:    return "psrad";
      case Asse_PACKSSD:  return "packssdw";
      case Asse_PACKSSW:  return "packsswb";
      case Asse_PACKUSW:  return "packuswb";
      case Asse_UNPCKHB:  return "punpckhb";
      case Asse_UNPCKHW:  return "punpckhw";
      case Asse_UNPCKHD:  return "punpckhd";
      case Asse_UNPCKHQ:  return "punpckhq";
      case Asse_UNPCKLB:  return "punpcklb";
      case Asse_UNPCKLW:  return "punpcklw";
      case Asse_UNPCKLD:  return "punpckld";
      case Asse_UNPCKLQ:  return "punpcklq";
      default: vpanic("showAMD64SseOp");
   }
}

AMD64Instr* AMD64Instr_Imm64 ( ULong imm64, HReg dst ) {
   AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag             = Ain_Imm64;
   i->Ain.Imm64.imm64 = imm64;
   i->Ain.Imm64.dst   = dst;
   return i;
}
AMD64Instr* AMD64Instr_Alu64R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag            = Ain_Alu64R;
   i->Ain.Alu64R.op  = op;
   i->Ain.Alu64R.src = src;
   i->Ain.Alu64R.dst = dst;
   return i;
}
AMD64Instr* AMD64Instr_Alu64M ( AMD64AluOp op, AMD64RI* src, AMD64AMode* dst ) {
   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag            = Ain_Alu64M;
   i->Ain.Alu64M.op  = op;
   i->Ain.Alu64M.src = src;
   i->Ain.Alu64M.dst = dst;
   vassert(op != Aalu_MUL);
   return i;
}
AMD64Instr* AMD64Instr_Sh64 ( AMD64ShiftOp op, UInt src, HReg dst ) {
   AMD64Instr* i   = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag          = Ain_Sh64;
   i->Ain.Sh64.op  = op;
   i->Ain.Sh64.src = src;
   i->Ain.Sh64.dst = dst;
   return i;
}
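/* Note: in Ain_Sh64 a 'src' value of 0 means "shift amount in %cl";
   any other value is a literal shift count.  See the corresponding
   cases in ppAMD64Instr and getRegUsage_AMD64Instr below. */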
AMD64Instr* AMD64Instr_Test64 ( UInt imm32, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_Test64;
   i->Ain.Test64.imm32 = imm32;
   i->Ain.Test64.dst   = dst;
   return i;
}
AMD64Instr* AMD64Instr_Unary64 ( AMD64UnaryOp op, HReg dst ) {
   AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag             = Ain_Unary64;
   i->Ain.Unary64.op  = op;
   i->Ain.Unary64.dst = dst;
   return i;
}
AMD64Instr* AMD64Instr_Lea64 ( AMD64AMode* am, HReg dst ) {
   AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag             = Ain_Lea64;
   i->Ain.Lea64.am    = am;
   i->Ain.Lea64.dst   = dst;
   return i;
}
AMD64Instr* AMD64Instr_Alu32R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag            = Ain_Alu32R;
   i->Ain.Alu32R.op  = op;
   i->Ain.Alu32R.src = src;
   i->Ain.Alu32R.dst = dst;
   switch (op) {
      case Aalu_ADD: case Aalu_SUB: case Aalu_CMP:
      case Aalu_AND: case Aalu_OR:  case Aalu_XOR: break;
      default: vassert(0);
   }
   return i;
}
AMD64Instr* AMD64Instr_MulL ( Bool syned, AMD64RM* src ) {
   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag            = Ain_MulL;
   i->Ain.MulL.syned = syned;
   i->Ain.MulL.src   = src;
   return i;
}
AMD64Instr* AMD64Instr_Div ( Bool syned, Int sz, AMD64RM* src ) {
   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag            = Ain_Div;
   i->Ain.Div.syned  = syned;
   i->Ain.Div.sz     = sz;
   i->Ain.Div.src    = src;
   vassert(sz == 4 || sz == 8);
   return i;
}
AMD64Instr* AMD64Instr_Push( AMD64RMI* src ) {
   AMD64Instr* i   = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag          = Ain_Push;
   i->Ain.Push.src = src;
   return i;
}
AMD64Instr* AMD64Instr_Call ( AMD64CondCode cond, Addr64 target, Int regparms,
                              RetLoc rloc ) {
   AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag               = Ain_Call;
   i->Ain.Call.cond     = cond;
   i->Ain.Call.target   = target;
   i->Ain.Call.regparms = regparms;
   i->Ain.Call.rloc     = rloc;
   vassert(regparms >= 0 && regparms <= 6);
   vassert(is_sane_RetLoc(rloc));
   return i;
}
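
/* For Ain_Call, 'regparms' (0 .. 6) is the number of parameter-carrying
   integer registers the call uses, filled in the order %rdi, %rsi,
   %rdx, %rcx, %r8, %r9 -- see the regparms switch in
   getRegUsage_AMD64Instr below. */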

AMD64Instr* AMD64Instr_XDirect ( Addr64 dstGA, AMD64AMode* amRIP,
                                 AMD64CondCode cond, Bool toFastEP ) {
   AMD64Instr* i           = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                  = Ain_XDirect;
   i->Ain.XDirect.dstGA    = dstGA;
   i->Ain.XDirect.amRIP    = amRIP;
   i->Ain.XDirect.cond     = cond;
   i->Ain.XDirect.toFastEP = toFastEP;
   return i;
}
AMD64Instr* AMD64Instr_XIndir ( HReg dstGA, AMD64AMode* amRIP,
                                AMD64CondCode cond ) {
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_XIndir;
   i->Ain.XIndir.dstGA = dstGA;
   i->Ain.XIndir.amRIP = amRIP;
   i->Ain.XIndir.cond  = cond;
   return i;
}
AMD64Instr* AMD64Instr_XAssisted ( HReg dstGA, AMD64AMode* amRIP,
                                   AMD64CondCode cond, IRJumpKind jk ) {
   AMD64Instr* i          = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                 = Ain_XAssisted;
   i->Ain.XAssisted.dstGA = dstGA;
   i->Ain.XAssisted.amRIP = amRIP;
   i->Ain.XAssisted.cond  = cond;
   i->Ain.XAssisted.jk    = jk;
   return i;
}
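
/* The three X* instructions are the conditional block exits: XDirect
   jumps to a known guest address via the patchable
   disp_cp_chain_me_to_fastEP / disp_cp_chain_me_to_slowEP stubs,
   XIndir jumps to a guest address held in a register via disp_indir,
   and XAssisted hands control to disp_assisted, passing the
   IRJumpKind in %rbp.  The pseudo-assembly printed by ppAMD64Instr
   below shows the exact sequences. */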

AMD64Instr* AMD64Instr_CMov64 ( AMD64CondCode cond, HReg src, HReg dst ) {
   AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag             = Ain_CMov64;
   i->Ain.CMov64.cond = cond;
   i->Ain.CMov64.src  = src;
   i->Ain.CMov64.dst  = dst;
   vassert(cond != Acc_ALWAYS);
   return i;
}
AMD64Instr* AMD64Instr_CLoad ( AMD64CondCode cond, UChar szB,
                               AMD64AMode* addr, HReg dst ) {
   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag            = Ain_CLoad;
   i->Ain.CLoad.cond = cond;
   i->Ain.CLoad.szB  = szB;
   i->Ain.CLoad.addr = addr;
   i->Ain.CLoad.dst  = dst;
   vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
   return i;
}
AMD64Instr* AMD64Instr_CStore ( AMD64CondCode cond, UChar szB,
                                HReg src, AMD64AMode* addr ) {
   AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag             = Ain_CStore;
   i->Ain.CStore.cond = cond;
   i->Ain.CStore.szB  = szB;
   i->Ain.CStore.src  = src;
   i->Ain.CStore.addr = addr;
   vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
   return i;
}
AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_MovxLQ;
   i->Ain.MovxLQ.syned = syned;
   i->Ain.MovxLQ.src   = src;
   i->Ain.MovxLQ.dst   = dst;
   return i;
}
AMD64Instr* AMD64Instr_LoadEX ( UChar szSmall, Bool syned,
                                AMD64AMode* src, HReg dst ) {
   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                = Ain_LoadEX;
   i->Ain.LoadEX.szSmall = szSmall;
   i->Ain.LoadEX.syned   = syned;
   i->Ain.LoadEX.src     = src;
   i->Ain.LoadEX.dst     = dst;
   vassert(szSmall == 1 || szSmall == 2 || szSmall == 4);
   return i;
}
AMD64Instr* AMD64Instr_Store ( UChar sz, HReg src, AMD64AMode* dst ) {
   AMD64Instr* i    = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag           = Ain_Store;
   i->Ain.Store.sz  = sz;
   i->Ain.Store.src = src;
   i->Ain.Store.dst = dst;
   vassert(sz == 1 || sz == 2 || sz == 4);
   return i;
}
AMD64Instr* AMD64Instr_Set64 ( AMD64CondCode cond, HReg dst ) {
   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag            = Ain_Set64;
   i->Ain.Set64.cond = cond;
   i->Ain.Set64.dst  = dst;
   return i;
}
AMD64Instr* AMD64Instr_Bsfr64 ( Bool isFwds, HReg src, HReg dst ) {
   AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag               = Ain_Bsfr64;
   i->Ain.Bsfr64.isFwds = isFwds;
   i->Ain.Bsfr64.src    = src;
   i->Ain.Bsfr64.dst    = dst;
   return i;
}
AMD64Instr* AMD64Instr_MFence ( void ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag        = Ain_MFence;
   return i;
}
AMD64Instr* AMD64Instr_ACAS ( AMD64AMode* addr, UChar sz ) {
   AMD64Instr* i    = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag           = Ain_ACAS;
   i->Ain.ACAS.addr = addr;
   i->Ain.ACAS.sz   = sz;
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   return i;
}
AMD64Instr* AMD64Instr_DACAS ( AMD64AMode* addr, UChar sz ) {
   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag            = Ain_DACAS;
   i->Ain.DACAS.addr = addr;
   i->Ain.DACAS.sz   = sz;
   vassert(sz == 8 || sz == 4);
   return i;
}
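
/* Ain_ACAS and Ain_DACAS use fixed registers implicitly: %rax/%rbx for
   the single-width compare-and-swap and %rdx:%rax/%rcx:%rbx for the
   double-width one, as the "lock cmpxchg" text printed by ppAMD64Instr
   below indicates. */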

AMD64Instr* AMD64Instr_A87Free ( Int nregs )
{
   AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag               = Ain_A87Free;
   i->Ain.A87Free.nregs = nregs;
   vassert(nregs >= 1 && nregs <= 7);
   return i;
}
AMD64Instr* AMD64Instr_A87PushPop ( AMD64AMode* addr, Bool isPush, UChar szB )
{
   AMD64Instr* i            = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                   = Ain_A87PushPop;
   i->Ain.A87PushPop.addr   = addr;
   i->Ain.A87PushPop.isPush = isPush;
   i->Ain.A87PushPop.szB    = szB;
   vassert(szB == 8 || szB == 4);
   return i;
}
AMD64Instr* AMD64Instr_A87FpOp ( A87FpOp op )
{
   AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag            = Ain_A87FpOp;
   i->Ain.A87FpOp.op = op;
   return i;
}
AMD64Instr* AMD64Instr_A87LdCW ( AMD64AMode* addr )
{
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_A87LdCW;
   i->Ain.A87LdCW.addr = addr;
   return i;
}
AMD64Instr* AMD64Instr_A87StSW ( AMD64AMode* addr )
{
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_A87StSW;
   i->Ain.A87StSW.addr = addr;
   return i;
}
AMD64Instr* AMD64Instr_LdMXCSR ( AMD64AMode* addr ) {
   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                = Ain_LdMXCSR;
   i->Ain.LdMXCSR.addr   = addr;
   return i;
}
AMD64Instr* AMD64Instr_SseUComIS ( Int sz, HReg srcL, HReg srcR, HReg dst ) {
   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                = Ain_SseUComIS;
   i->Ain.SseUComIS.sz   = toUChar(sz);
   i->Ain.SseUComIS.srcL = srcL;
   i->Ain.SseUComIS.srcR = srcR;
   i->Ain.SseUComIS.dst  = dst;
   vassert(sz == 4 || sz == 8);
   return i;
}
AMD64Instr* AMD64Instr_SseSI2SF ( Int szS, Int szD, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_SseSI2SF;
   i->Ain.SseSI2SF.szS = toUChar(szS);
   i->Ain.SseSI2SF.szD = toUChar(szD);
   i->Ain.SseSI2SF.src = src;
   i->Ain.SseSI2SF.dst = dst;
   vassert(szS == 4 || szS == 8);
   vassert(szD == 4 || szD == 8);
   return i;
}
AMD64Instr* AMD64Instr_SseSF2SI ( Int szS, Int szD, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_SseSF2SI;
   i->Ain.SseSF2SI.szS = toUChar(szS);
   i->Ain.SseSF2SI.szD = toUChar(szD);
   i->Ain.SseSF2SI.src = src;
   i->Ain.SseSF2SI.dst = dst;
   vassert(szS == 4 || szS == 8);
   vassert(szD == 4 || szD == 8);
   return i;
}
AMD64Instr* AMD64Instr_SseSDSS   ( Bool from64, HReg src, HReg dst )
{
   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                = Ain_SseSDSS;
   i->Ain.SseSDSS.from64 = from64;
   i->Ain.SseSDSS.src    = src;
   i->Ain.SseSDSS.dst    = dst;
   return i;
}
AMD64Instr* AMD64Instr_SseLdSt ( Bool isLoad, Int sz,
                                 HReg reg, AMD64AMode* addr ) {
   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                = Ain_SseLdSt;
   i->Ain.SseLdSt.isLoad = isLoad;
   i->Ain.SseLdSt.sz     = toUChar(sz);
   i->Ain.SseLdSt.reg    = reg;
   i->Ain.SseLdSt.addr   = addr;
   vassert(sz == 4 || sz == 8 || sz == 16);
   return i;
}
AMD64Instr* AMD64Instr_SseLdzLO  ( Int sz, HReg reg, AMD64AMode* addr )
{
   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                = Ain_SseLdzLO;
   i->Ain.SseLdzLO.sz    = sz;
   i->Ain.SseLdzLO.reg   = reg;
   i->Ain.SseLdzLO.addr  = addr;
   vassert(sz == 4 || sz == 8);
   return i;
}
AMD64Instr* AMD64Instr_Sse32Fx4 ( AMD64SseOp op, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_Sse32Fx4;
   i->Ain.Sse32Fx4.op  = op;
   i->Ain.Sse32Fx4.src = src;
   i->Ain.Sse32Fx4.dst = dst;
   vassert(op != Asse_MOV);
   return i;
}
AMD64Instr* AMD64Instr_Sse32FLo ( AMD64SseOp op, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_Sse32FLo;
   i->Ain.Sse32FLo.op  = op;
   i->Ain.Sse32FLo.src = src;
   i->Ain.Sse32FLo.dst = dst;
   vassert(op != Asse_MOV);
   return i;
}
AMD64Instr* AMD64Instr_Sse64Fx2 ( AMD64SseOp op, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_Sse64Fx2;
   i->Ain.Sse64Fx2.op  = op;
   i->Ain.Sse64Fx2.src = src;
   i->Ain.Sse64Fx2.dst = dst;
   vassert(op != Asse_MOV);
   return i;
}
AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp op, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_Sse64FLo;
   i->Ain.Sse64FLo.op  = op;
   i->Ain.Sse64FLo.src = src;
   i->Ain.Sse64FLo.dst = dst;
   vassert(op != Asse_MOV);
   return i;
}
AMD64Instr* AMD64Instr_SseReRg ( AMD64SseOp op, HReg re, HReg rg ) {
   AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag             = Ain_SseReRg;
   i->Ain.SseReRg.op  = op;
   i->Ain.SseReRg.src = re;
   i->Ain.SseReRg.dst = rg;
   return i;
}
AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode cond, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag              = Ain_SseCMov;
   i->Ain.SseCMov.cond = cond;
   i->Ain.SseCMov.src  = src;
   i->Ain.SseCMov.dst  = dst;
   vassert(cond != Acc_ALWAYS);
   return i;
}
AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) {
   AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag               = Ain_SseShuf;
   i->Ain.SseShuf.order = order;
   i->Ain.SseShuf.src   = src;
   i->Ain.SseShuf.dst   = dst;
   vassert(order >= 0 && order <= 0xFF);
   return i;
}
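/* In Ain_SseShuf, 'order' is the 8-bit immediate handed to pshufd, as
   printed by ppAMD64Instr ("pshufd $0x%x,...") below. */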
//uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad,
//uu                                  HReg reg, AMD64AMode* addr ) {
//uu    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
//uu    i->tag                = Ain_AvxLdSt;
//uu    i->Ain.AvxLdSt.isLoad = isLoad;
//uu    i->Ain.AvxLdSt.reg    = reg;
//uu    i->Ain.AvxLdSt.addr   = addr;
//uu    return i;
//uu }
//uu AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp op, HReg re, HReg rg ) {
//uu    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
//uu    i->tag             = Ain_AvxReRg;
//uu    i->Ain.AvxReRg.op  = op;
//uu    i->Ain.AvxReRg.src = re;
//uu    i->Ain.AvxReRg.dst = rg;
//uu    return i;
//uu }
AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
                                 AMD64AMode* amFailAddr ) {
   AMD64Instr* i             = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag                    = Ain_EvCheck;
   i->Ain.EvCheck.amCounter  = amCounter;
   i->Ain.EvCheck.amFailAddr = amFailAddr;
   return i;
}
AMD64Instr* AMD64Instr_ProfInc ( void ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag        = Ain_ProfInc;
   return i;
}

void ppAMD64Instr ( const AMD64Instr* i, Bool mode64 )
{
   vassert(mode64 == True);
   switch (i->tag) {
      case Ain_Imm64:
         vex_printf("movabsq $0x%llx,", i->Ain.Imm64.imm64);
         ppHRegAMD64(i->Ain.Imm64.dst);
         return;
      case Ain_Alu64R:
         vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64R.op));
         ppAMD64RMI(i->Ain.Alu64R.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Alu64R.dst);
         return;
      case Ain_Alu64M:
         vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64M.op));
         ppAMD64RI(i->Ain.Alu64M.src);
         vex_printf(",");
         ppAMD64AMode(i->Ain.Alu64M.dst);
         return;
      case Ain_Sh64:
         vex_printf("%sq ", showAMD64ShiftOp(i->Ain.Sh64.op));
         if (i->Ain.Sh64.src == 0)
            vex_printf("%%cl,");
         else
            vex_printf("$%d,", (Int)i->Ain.Sh64.src);
         ppHRegAMD64(i->Ain.Sh64.dst);
         return;
      case Ain_Test64:
         vex_printf("testq $%d,", (Int)i->Ain.Test64.imm32);
         ppHRegAMD64(i->Ain.Test64.dst);
         return;
      case Ain_Unary64:
         vex_printf("%sq ", showAMD64UnaryOp(i->Ain.Unary64.op));
         ppHRegAMD64(i->Ain.Unary64.dst);
         return;
      case Ain_Lea64:
         vex_printf("leaq ");
         ppAMD64AMode(i->Ain.Lea64.am);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Lea64.dst);
         return;
      case Ain_Alu32R:
         vex_printf("%sl ", showAMD64AluOp(i->Ain.Alu32R.op));
         ppAMD64RMI_lo32(i->Ain.Alu32R.src);
         vex_printf(",");
         ppHRegAMD64_lo32(i->Ain.Alu32R.dst);
         return;
      case Ain_MulL:
         vex_printf("%cmulq ", i->Ain.MulL.syned ? 's' : 'u');
         ppAMD64RM(i->Ain.MulL.src);
         return;
      case Ain_Div:
         vex_printf("%cdiv%s ",
                    i->Ain.Div.syned ? 's' : 'u',
                    showAMD64ScalarSz(i->Ain.Div.sz));
         ppAMD64RM(i->Ain.Div.src);
         return;
      case Ain_Push:
         vex_printf("pushq ");
         ppAMD64RMI(i->Ain.Push.src);
         return;
      case Ain_Call:
         vex_printf("call%s[%d,",
                    i->Ain.Call.cond==Acc_ALWAYS
                       ? "" : showAMD64CondCode(i->Ain.Call.cond),
                    i->Ain.Call.regparms );
         ppRetLoc(i->Ain.Call.rloc);
         vex_printf("] 0x%llx", i->Ain.Call.target);
         break;

      case Ain_XDirect:
         vex_printf("(xDirect) ");
         vex_printf("if (%%rflags.%s) { ",
                    showAMD64CondCode(i->Ain.XDirect.cond));
         vex_printf("movabsq $0x%llx,%%r11; ", i->Ain.XDirect.dstGA);
         vex_printf("movq %%r11,");
         ppAMD64AMode(i->Ain.XDirect.amRIP);
         vex_printf("; ");
         vex_printf("movabsq $disp_cp_chain_me_to_%sEP,%%r11; call *%%r11 }",
                    i->Ain.XDirect.toFastEP ? "fast" : "slow");
         return;
      case Ain_XIndir:
         vex_printf("(xIndir) ");
         vex_printf("if (%%rflags.%s) { ",
                    showAMD64CondCode(i->Ain.XIndir.cond));
         vex_printf("movq ");
         ppHRegAMD64(i->Ain.XIndir.dstGA);
         vex_printf(",");
         ppAMD64AMode(i->Ain.XIndir.amRIP);
         vex_printf("; movabsq $disp_indir,%%r11; jmp *%%r11 }");
         return;
      case Ain_XAssisted:
         vex_printf("(xAssisted) ");
         vex_printf("if (%%rflags.%s) { ",
                    showAMD64CondCode(i->Ain.XAssisted.cond));
         vex_printf("movq ");
         ppHRegAMD64(i->Ain.XAssisted.dstGA);
         vex_printf(",");
         ppAMD64AMode(i->Ain.XAssisted.amRIP);
         vex_printf("; movl $IRJumpKind_to_TRCVAL(%d),%%rbp",
                    (Int)i->Ain.XAssisted.jk);
         vex_printf("; movabsq $disp_assisted,%%r11; jmp *%%r11 }");
         return;

      case Ain_CMov64:
         vex_printf("cmov%s ", showAMD64CondCode(i->Ain.CMov64.cond));
         ppHRegAMD64(i->Ain.CMov64.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.CMov64.dst);
         return;
      case Ain_CLoad:
         vex_printf("if (%%rflags.%s) { ",
                    showAMD64CondCode(i->Ain.CLoad.cond));
         vex_printf("mov%c ", i->Ain.CLoad.szB == 4 ? 'l' : 'q');
         ppAMD64AMode(i->Ain.CLoad.addr);
         vex_printf(", ");
         (i->Ain.CLoad.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
            (i->Ain.CLoad.dst);
         vex_printf(" }");
         return;
      case Ain_CStore:
         vex_printf("if (%%rflags.%s) { ",
                    showAMD64CondCode(i->Ain.CStore.cond));
         vex_printf("mov%c ", i->Ain.CStore.szB == 4 ? 'l' : 'q');
         (i->Ain.CStore.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
            (i->Ain.CStore.src);
         vex_printf(", ");
         ppAMD64AMode(i->Ain.CStore.addr);
         vex_printf(" }");
         return;

      case Ain_MovxLQ:
         vex_printf("mov%clq ", i->Ain.MovxLQ.syned ? 's' : 'z');
         ppHRegAMD64_lo32(i->Ain.MovxLQ.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.MovxLQ.dst);
         return;
      case Ain_LoadEX:
         if (i->Ain.LoadEX.szSmall==4 && !i->Ain.LoadEX.syned) {
            vex_printf("movl ");
            ppAMD64AMode(i->Ain.LoadEX.src);
            vex_printf(",");
            ppHRegAMD64_lo32(i->Ain.LoadEX.dst);
         } else {
            vex_printf("mov%c%cq ",
                       i->Ain.LoadEX.syned ? 's' : 'z',
                       i->Ain.LoadEX.szSmall==1
                          ? 'b'
                          : (i->Ain.LoadEX.szSmall==2 ? 'w' : 'l'));
            ppAMD64AMode(i->Ain.LoadEX.src);
            vex_printf(",");
            ppHRegAMD64(i->Ain.LoadEX.dst);
         }
         return;
      case Ain_Store:
         vex_printf("mov%c ", i->Ain.Store.sz==1 ? 'b'
                              : (i->Ain.Store.sz==2 ? 'w' : 'l'));
         ppHRegAMD64(i->Ain.Store.src);
         vex_printf(",");
         ppAMD64AMode(i->Ain.Store.dst);
         return;
      case Ain_Set64:
         vex_printf("setq%s ", showAMD64CondCode(i->Ain.Set64.cond));
         ppHRegAMD64(i->Ain.Set64.dst);
         return;
      case Ain_Bsfr64:
         vex_printf("bs%cq ", i->Ain.Bsfr64.isFwds ? 'f' : 'r');
         ppHRegAMD64(i->Ain.Bsfr64.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Bsfr64.dst);
         return;
      case Ain_MFence:
         vex_printf("mfence" );
         return;
      case Ain_ACAS:
         vex_printf("lock cmpxchg%c ",
                     i->Ain.ACAS.sz==1 ? 'b' : i->Ain.ACAS.sz==2 ? 'w'
                     : i->Ain.ACAS.sz==4 ? 'l' : 'q' );
         vex_printf("{%%rax->%%rbx},");
         ppAMD64AMode(i->Ain.ACAS.addr);
         return;
      case Ain_DACAS:
         vex_printf("lock cmpxchg%db {%%rdx:%%rax->%%rcx:%%rbx},",
                    (Int)(2 * i->Ain.DACAS.sz));
         ppAMD64AMode(i->Ain.DACAS.addr);
         return;
      case Ain_A87Free:
         vex_printf("ffree %%st(7..%d)", 8 - i->Ain.A87Free.nregs );
         break;
      case Ain_A87PushPop:
         vex_printf(i->Ain.A87PushPop.isPush ? "fld%c " : "fstp%c ",
                    i->Ain.A87PushPop.szB == 4 ? 's' : 'l');
         ppAMD64AMode(i->Ain.A87PushPop.addr);
         break;
      case Ain_A87FpOp:
         vex_printf("f%s", showA87FpOp(i->Ain.A87FpOp.op));
         break;
      case Ain_A87LdCW:
         vex_printf("fldcw ");
         ppAMD64AMode(i->Ain.A87LdCW.addr);
         break;
      case Ain_A87StSW:
         vex_printf("fstsw ");
         ppAMD64AMode(i->Ain.A87StSW.addr);
         break;
      case Ain_LdMXCSR:
         vex_printf("ldmxcsr ");
         ppAMD64AMode(i->Ain.LdMXCSR.addr);
         break;
      case Ain_SseUComIS:
         vex_printf("ucomis%s ", i->Ain.SseUComIS.sz==4 ? "s" : "d");
         ppHRegAMD64(i->Ain.SseUComIS.srcL);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseUComIS.srcR);
         vex_printf(" ; pushfq ; popq ");
         ppHRegAMD64(i->Ain.SseUComIS.dst);
         break;
      case Ain_SseSI2SF:
         vex_printf("cvtsi2s%s ", i->Ain.SseSI2SF.szD==4 ? "s" : "d");
         (i->Ain.SseSI2SF.szS==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
            (i->Ain.SseSI2SF.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseSI2SF.dst);
         break;
      case Ain_SseSF2SI:
         vex_printf("cvts%s2si ", i->Ain.SseSF2SI.szS==4 ? "s" : "d");
         ppHRegAMD64(i->Ain.SseSF2SI.src);
         vex_printf(",");
         (i->Ain.SseSF2SI.szD==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
            (i->Ain.SseSF2SI.dst);
         break;
      case Ain_SseSDSS:
         vex_printf(i->Ain.SseSDSS.from64 ? "cvtsd2ss " : "cvtss2sd ");
         ppHRegAMD64(i->Ain.SseSDSS.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseSDSS.dst);
         break;
      case Ain_SseLdSt:
         switch (i->Ain.SseLdSt.sz) {
            case 4:  vex_printf("movss "); break;
            case 8:  vex_printf("movsd "); break;
            case 16: vex_printf("movups "); break;
            default: vassert(0);
         }
         if (i->Ain.SseLdSt.isLoad) {
            ppAMD64AMode(i->Ain.SseLdSt.addr);
            vex_printf(",");
            ppHRegAMD64(i->Ain.SseLdSt.reg);
         } else {
            ppHRegAMD64(i->Ain.SseLdSt.reg);
            vex_printf(",");
            ppAMD64AMode(i->Ain.SseLdSt.addr);
         }
         return;
      case Ain_SseLdzLO:
         vex_printf("movs%s ", i->Ain.SseLdzLO.sz==4 ? "s" : "d");
         ppAMD64AMode(i->Ain.SseLdzLO.addr);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseLdzLO.reg);
         return;
      case Ain_Sse32Fx4:
         vex_printf("%sps ", showAMD64SseOp(i->Ain.Sse32Fx4.op));
         ppHRegAMD64(i->Ain.Sse32Fx4.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Sse32Fx4.dst);
         return;
      case Ain_Sse32FLo:
         vex_printf("%sss ", showAMD64SseOp(i->Ain.Sse32FLo.op));
         ppHRegAMD64(i->Ain.Sse32FLo.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Sse32FLo.dst);
         return;
      case Ain_Sse64Fx2:
         vex_printf("%spd ", showAMD64SseOp(i->Ain.Sse64Fx2.op));
         ppHRegAMD64(i->Ain.Sse64Fx2.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Sse64Fx2.dst);
         return;
      case Ain_Sse64FLo:
         vex_printf("%ssd ", showAMD64SseOp(i->Ain.Sse64FLo.op));
         ppHRegAMD64(i->Ain.Sse64FLo.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Sse64FLo.dst);
         return;
      case Ain_SseReRg:
         vex_printf("%s ", showAMD64SseOp(i->Ain.SseReRg.op));
         ppHRegAMD64(i->Ain.SseReRg.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseReRg.dst);
         return;
      case Ain_SseCMov:
         vex_printf("cmov%s ", showAMD64CondCode(i->Ain.SseCMov.cond));
         ppHRegAMD64(i->Ain.SseCMov.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseCMov.dst);
         return;
      case Ain_SseShuf:
         vex_printf("pshufd $0x%x,", i->Ain.SseShuf.order);
         ppHRegAMD64(i->Ain.SseShuf.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseShuf.dst);
         return;
      //uu case Ain_AvxLdSt:
      //uu    vex_printf("vmovups ");
      //uu    if (i->Ain.AvxLdSt.isLoad) {
      //uu       ppAMD64AMode(i->Ain.AvxLdSt.addr);
      //uu       vex_printf(",");
      //uu       ppHRegAMD64(i->Ain.AvxLdSt.reg);
      //uu    } else {
      //uu       ppHRegAMD64(i->Ain.AvxLdSt.reg);
      //uu       vex_printf(",");
      //uu       ppAMD64AMode(i->Ain.AvxLdSt.addr);
      //uu    }
      //uu    return;
      //uu case Ain_AvxReRg:
      //uu    vex_printf("v%s ", showAMD64SseOp(i->Ain.SseReRg.op));
      //uu    ppHRegAMD64(i->Ain.AvxReRg.src);
      //uu    vex_printf(",");
      //uu    ppHRegAMD64(i->Ain.AvxReRg.dst);
      //uu    return;
      case Ain_EvCheck:
         vex_printf("(evCheck) decl ");
         ppAMD64AMode(i->Ain.EvCheck.amCounter);
         vex_printf("; jns nofail; jmp *");
         ppAMD64AMode(i->Ain.EvCheck.amFailAddr);
         vex_printf("; nofail:");
         return;
      case Ain_ProfInc:
         vex_printf("(profInc) movabsq $NotKnownYet, %%r11; incq (%%r11)");
         return;
      default:
         vpanic("ppAMD64Instr");
   }
}

/* --------- Helpers for register allocation. --------- */

void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 )
{
   Bool unary;
   vassert(mode64 == True);
   initHRegUsage(u);
   switch (i->tag) {
      case Ain_Imm64:
         addHRegUse(u, HRmWrite, i->Ain.Imm64.dst);
         return;
      case Ain_Alu64R:
         addRegUsage_AMD64RMI(u, i->Ain.Alu64R.src);
         if (i->Ain.Alu64R.op == Aalu_MOV) {
            addHRegUse(u, HRmWrite, i->Ain.Alu64R.dst);
            return;
         }
         if (i->Ain.Alu64R.op == Aalu_CMP) {
            addHRegUse(u, HRmRead, i->Ain.Alu64R.dst);
            return;
         }
         addHRegUse(u, HRmModify, i->Ain.Alu64R.dst);
         return;
      case Ain_Alu64M:
         addRegUsage_AMD64RI(u, i->Ain.Alu64M.src);
         addRegUsage_AMD64AMode(u, i->Ain.Alu64M.dst);
         return;
      case Ain_Sh64:
         addHRegUse(u, HRmModify, i->Ain.Sh64.dst);
         if (i->Ain.Sh64.src == 0)
            addHRegUse(u, HRmRead, hregAMD64_RCX());
         return;
      case Ain_Test64:
         addHRegUse(u, HRmRead, i->Ain.Test64.dst);
         return;
      case Ain_Unary64:
         addHRegUse(u, HRmModify, i->Ain.Unary64.dst);
         return;
      case Ain_Lea64:
         addRegUsage_AMD64AMode(u, i->Ain.Lea64.am);
         addHRegUse(u, HRmWrite, i->Ain.Lea64.dst);
         return;
      case Ain_Alu32R:
         vassert(i->Ain.Alu32R.op != Aalu_MOV);
         addRegUsage_AMD64RMI(u, i->Ain.Alu32R.src);
         if (i->Ain.Alu32R.op == Aalu_CMP) {
            addHRegUse(u, HRmRead, i->Ain.Alu32R.dst);
            return;
         }
         addHRegUse(u, HRmModify, i->Ain.Alu32R.dst);
         return;
      case Ain_MulL:
         addRegUsage_AMD64RM(u, i->Ain.MulL.src, HRmRead);
         addHRegUse(u, HRmModify, hregAMD64_RAX());
         addHRegUse(u, HRmWrite, hregAMD64_RDX());
         return;
      case Ain_Div:
         addRegUsage_AMD64RM(u, i->Ain.Div.src, HRmRead);
         addHRegUse(u, HRmModify, hregAMD64_RAX());
         addHRegUse(u, HRmModify, hregAMD64_RDX());
         return;
      case Ain_Push:
         addRegUsage_AMD64RMI(u, i->Ain.Push.src);
         addHRegUse(u, HRmModify, hregAMD64_RSP());
         return;
      case Ain_Call:
         /* This is a bit subtle. */
         /* First off, claim it trashes all the caller-saved regs
            which fall within the register allocator's jurisdiction.
            These I believe to be: rax rcx rdx rsi rdi r8 r9 r10 r11
            and all the xmm registers.
         */
         addHRegUse(u, HRmWrite, hregAMD64_RAX());
         addHRegUse(u, HRmWrite, hregAMD64_RCX());
         addHRegUse(u, HRmWrite, hregAMD64_RDX());
         addHRegUse(u, HRmWrite, hregAMD64_RSI());
         addHRegUse(u, HRmWrite, hregAMD64_RDI());
         addHRegUse(u, HRmWrite, hregAMD64_R8());
         addHRegUse(u, HRmWrite, hregAMD64_R9());
         addHRegUse(u, HRmWrite, hregAMD64_R10());
         addHRegUse(u, HRmWrite, hregAMD64_R11());
         addHRegUse(u, HRmWrite, hregAMD64_XMM0());
         addHRegUse(u, HRmWrite, hregAMD64_XMM1());
         addHRegUse(u, HRmWrite, hregAMD64_XMM3());
         addHRegUse(u, HRmWrite, hregAMD64_XMM4());
         addHRegUse(u, HRmWrite, hregAMD64_XMM5());
         addHRegUse(u, HRmWrite, hregAMD64_XMM6());
         addHRegUse(u, HRmWrite, hregAMD64_XMM7());
         addHRegUse(u, HRmWrite, hregAMD64_XMM8());
         addHRegUse(u, HRmWrite, hregAMD64_XMM9());
         addHRegUse(u, HRmWrite, hregAMD64_XMM10());
         addHRegUse(u, HRmWrite, hregAMD64_XMM11());
         addHRegUse(u, HRmWrite, hregAMD64_XMM12());

         /* Now we have to state any parameter-carrying registers
            which might be read.  This depends on the regparmness. */
         switch (i->Ain.Call.regparms) {
            case 6: addHRegUse(u, HRmRead, hregAMD64_R9());  /*fallthru*/
            case 5: addHRegUse(u, HRmRead, hregAMD64_R8());  /*fallthru*/
            case 4: addHRegUse(u, HRmRead, hregAMD64_RCX()); /*fallthru*/
            case 3: addHRegUse(u, HRmRead, hregAMD64_RDX()); /*fallthru*/
            case 2: addHRegUse(u, HRmRead, hregAMD64_RSI()); /*fallthru*/
            case 1: addHRegUse(u, HRmRead, hregAMD64_RDI()); break;
            case 0: break;
            default: vpanic("getRegUsage_AMD64Instr:Call:regparms");
         }
         /* Finally, there is the issue that the insn trashes a
            register because the literal target address has to be
            loaded into a register.  Fortunately, r11 is stated in the
            ABI as a scratch register, and so seems a suitable victim.  */
         addHRegUse(u, HRmWrite, hregAMD64_R11());
         /* Upshot of this is that the assembler really must use r11,
            and no other, as a destination temporary. */
         return;
      /* XDirect/XIndir/XAssisted are also a bit subtle.  They
         conditionally exit the block.  Hence we only need to list (1)
         the registers that they read, and (2) the registers that they
         write in the case where the block is not exited.  (2) is
         empty, hence only (1) is relevant here. */
      case Ain_XDirect:
         /* Don't bother to mention the write to %r11, since it is not
            available to the allocator. */
         addRegUsage_AMD64AMode(u, i->Ain.XDirect.amRIP);
         return;
      case Ain_XIndir:
         /* Ditto re %r11 */
         addHRegUse(u, HRmRead, i->Ain.XIndir.dstGA);
         addRegUsage_AMD64AMode(u, i->Ain.XIndir.amRIP);
         return;
      case Ain_XAssisted:
         /* Ditto re %r11 and %rbp (the baseblock ptr) */
         addHRegUse(u, HRmRead, i->Ain.XAssisted.dstGA);
1484          addRegUsage_AMD64AMode(u, i->Ain.XAssisted.amRIP);
1485          return;
1486       case Ain_CMov64:
1487          addHRegUse(u, HRmRead,   i->Ain.CMov64.src);
1488          addHRegUse(u, HRmModify, i->Ain.CMov64.dst);
1489          return;
1490       case Ain_CLoad:
1491          addRegUsage_AMD64AMode(u, i->Ain.CLoad.addr);
1492          addHRegUse(u, HRmModify, i->Ain.CLoad.dst);
1493          return;
1494       case Ain_CStore:
1495          addRegUsage_AMD64AMode(u, i->Ain.CStore.addr);
1496          addHRegUse(u, HRmRead, i->Ain.CStore.src);
1497          return;
1498       case Ain_MovxLQ:
1499          addHRegUse(u, HRmRead,  i->Ain.MovxLQ.src);
1500          addHRegUse(u, HRmWrite, i->Ain.MovxLQ.dst);
1501          return;
1502       case Ain_LoadEX:
1503          addRegUsage_AMD64AMode(u, i->Ain.LoadEX.src);
1504          addHRegUse(u, HRmWrite, i->Ain.LoadEX.dst);
1505          return;
1506       case Ain_Store:
1507          addHRegUse(u, HRmRead, i->Ain.Store.src);
1508          addRegUsage_AMD64AMode(u, i->Ain.Store.dst);
1509          return;
1510       case Ain_Set64:
1511          addHRegUse(u, HRmWrite, i->Ain.Set64.dst);
1512          return;
1513       case Ain_Bsfr64:
1514          addHRegUse(u, HRmRead, i->Ain.Bsfr64.src);
1515          addHRegUse(u, HRmWrite, i->Ain.Bsfr64.dst);
1516          return;
1517       case Ain_MFence:
1518          return;
1519       case Ain_ACAS:
1520          addRegUsage_AMD64AMode(u, i->Ain.ACAS.addr);
1521          addHRegUse(u, HRmRead, hregAMD64_RBX());
1522          addHRegUse(u, HRmModify, hregAMD64_RAX());
1523          return;
1524       case Ain_DACAS:
1525          addRegUsage_AMD64AMode(u, i->Ain.DACAS.addr);
1526          addHRegUse(u, HRmRead, hregAMD64_RCX());
1527          addHRegUse(u, HRmRead, hregAMD64_RBX());
1528          addHRegUse(u, HRmModify, hregAMD64_RDX());
1529          addHRegUse(u, HRmModify, hregAMD64_RAX());
1530          return;
1531       case Ain_A87Free:
1532          return;
1533       case Ain_A87PushPop:
1534          addRegUsage_AMD64AMode(u, i->Ain.A87PushPop.addr);
1535          return;
1536       case Ain_A87FpOp:
1537          return;
1538       case Ain_A87LdCW:
1539          addRegUsage_AMD64AMode(u, i->Ain.A87LdCW.addr);
1540          return;
1541       case Ain_A87StSW:
1542          addRegUsage_AMD64AMode(u, i->Ain.A87StSW.addr);
1543          return;
1544       case Ain_LdMXCSR:
1545          addRegUsage_AMD64AMode(u, i->Ain.LdMXCSR.addr);
1546          return;
1547       case Ain_SseUComIS:
1548          addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcL);
1549          addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcR);
1550          addHRegUse(u, HRmWrite, i->Ain.SseUComIS.dst);
1551          return;
1552       case Ain_SseSI2SF:
1553          addHRegUse(u, HRmRead,  i->Ain.SseSI2SF.src);
1554          addHRegUse(u, HRmWrite, i->Ain.SseSI2SF.dst);
1555          return;
1556       case Ain_SseSF2SI:
1557          addHRegUse(u, HRmRead,  i->Ain.SseSF2SI.src);
1558          addHRegUse(u, HRmWrite, i->Ain.SseSF2SI.dst);
1559          return;
1560       case Ain_SseSDSS:
1561          addHRegUse(u, HRmRead,  i->Ain.SseSDSS.src);
1562          addHRegUse(u, HRmWrite, i->Ain.SseSDSS.dst);
1563          return;
1564       case Ain_SseLdSt:
1565          addRegUsage_AMD64AMode(u, i->Ain.SseLdSt.addr);
1566          addHRegUse(u, i->Ain.SseLdSt.isLoad ? HRmWrite : HRmRead,
1567                        i->Ain.SseLdSt.reg);
1568          return;
1569       case Ain_SseLdzLO:
1570          addRegUsage_AMD64AMode(u, i->Ain.SseLdzLO.addr);
1571          addHRegUse(u, HRmWrite, i->Ain.SseLdzLO.reg);
1572          return;
1573       case Ain_Sse32Fx4:
1574          vassert(i->Ain.Sse32Fx4.op != Asse_MOV);
1575          unary = toBool( i->Ain.Sse32Fx4.op == Asse_RCPF
1576                          || i->Ain.Sse32Fx4.op == Asse_RSQRTF
1577                          || i->Ain.Sse32Fx4.op == Asse_SQRTF );
1578          addHRegUse(u, HRmRead, i->Ain.Sse32Fx4.src);
1579          addHRegUse(u, unary ? HRmWrite : HRmModify,
1580                        i->Ain.Sse32Fx4.dst);
1581          return;
1582       case Ain_Sse32FLo:
1583          vassert(i->Ain.Sse32FLo.op != Asse_MOV);
1584          unary = toBool( i->Ain.Sse32FLo.op == Asse_RCPF
1585                          || i->Ain.Sse32FLo.op == Asse_RSQRTF
1586                          || i->Ain.Sse32FLo.op == Asse_SQRTF );
1587          addHRegUse(u, HRmRead, i->Ain.Sse32FLo.src);
1588          addHRegUse(u, unary ? HRmWrite : HRmModify,
1589                        i->Ain.Sse32FLo.dst);
1590          return;
1591       case Ain_Sse64Fx2:
1592          vassert(i->Ain.Sse64Fx2.op != Asse_MOV);
1593          unary = toBool( i->Ain.Sse64Fx2.op == Asse_RCPF
1594                          || i->Ain.Sse64Fx2.op == Asse_RSQRTF
1595                          || i->Ain.Sse64Fx2.op == Asse_SQRTF );
1596          addHRegUse(u, HRmRead, i->Ain.Sse64Fx2.src);
1597          addHRegUse(u, unary ? HRmWrite : HRmModify,
1598                        i->Ain.Sse64Fx2.dst);
1599          return;
1600       case Ain_Sse64FLo:
1601          vassert(i->Ain.Sse64FLo.op != Asse_MOV);
1602          unary = toBool( i->Ain.Sse64FLo.op == Asse_RCPF
1603                          || i->Ain.Sse64FLo.op == Asse_RSQRTF
1604                          || i->Ain.Sse64FLo.op == Asse_SQRTF );
1605          addHRegUse(u, HRmRead, i->Ain.Sse64FLo.src);
1606          addHRegUse(u, unary ? HRmWrite : HRmModify,
1607                        i->Ain.Sse64FLo.dst);
1608          return;
1609       case Ain_SseReRg:
1610          if ( (i->Ain.SseReRg.op == Asse_XOR
1611                || i->Ain.SseReRg.op == Asse_CMPEQ32)
1612               && sameHReg(i->Ain.SseReRg.src, i->Ain.SseReRg.dst)) {
1613             /* reg-alloc needs to understand 'xor r,r' and 'cmpeqd
1614                r,r' as a write of a value to r which is independent of
1615                any previous value in r */
1616             /* (as opposed to a rite of passage :-) */
1617             addHRegUse(u, HRmWrite, i->Ain.SseReRg.dst);
1618          } else {
1619             addHRegUse(u, HRmRead, i->Ain.SseReRg.src);
1620             addHRegUse(u, i->Ain.SseReRg.op == Asse_MOV
1621                              ? HRmWrite : HRmModify,
1622                           i->Ain.SseReRg.dst);
1623          }
1624          return;
1625       case Ain_SseCMov:
1626          addHRegUse(u, HRmRead,   i->Ain.SseCMov.src);
1627          addHRegUse(u, HRmModify, i->Ain.SseCMov.dst);
1628          return;
1629       case Ain_SseShuf:
1630          addHRegUse(u, HRmRead,  i->Ain.SseShuf.src);
1631          addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst);
1632          return;
1633       //uu case Ain_AvxLdSt:
1634       //uu addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr);
1635       //uu addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead,
1636       //uu               i->Ain.AvxLdSt.reg);
1637       //uu return;
1638       //uu case Ain_AvxReRg:
1639       //uu    if ( (i->Ain.AvxReRg.op == Asse_XOR
1640       //uu          || i->Ain.AvxReRg.op == Asse_CMPEQ32)
1641       //uu         && i->Ain.AvxReRg.src == i->Ain.AvxReRg.dst) {
1642       //uu       /* See comments on the case for Ain_SseReRg. */
1643       //uu       addHRegUse(u, HRmWrite, i->Ain.AvxReRg.dst);
1644       //uu    } else {
1645       //uu       addHRegUse(u, HRmRead, i->Ain.AvxReRg.src);
1646       //uu       addHRegUse(u, i->Ain.AvxReRg.op == Asse_MOV
1647       //uu                        ? HRmWrite : HRmModify,
1648       //uu                     i->Ain.AvxReRg.dst);
1649       //uu    }
1650       //uu    return;
1651       case Ain_EvCheck:
1652          /* We expect both amodes only to mention %rbp, so this is in
1653             fact pointless, since %rbp isn't allocatable, but anyway.. */
1654          addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amCounter);
1655          addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amFailAddr);
1656          return;
1657       case Ain_ProfInc:
1658          addHRegUse(u, HRmWrite, hregAMD64_R11());
1659          return;
1660       default:
1661          ppAMD64Instr(i, mode64);
1662          vpanic("getRegUsage_AMD64Instr");
1663    }
1664 }
1665 
1666 /* local helper */
1667 static inline void mapReg(HRegRemap* m, HReg* r)
1668 {
1669    *r = lookupHRegRemap(m, *r);
1670 }
1671 
1672 void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
1673 {
1674    vassert(mode64 == True);
1675    switch (i->tag) {
1676       case Ain_Imm64:
1677          mapReg(m, &i->Ain.Imm64.dst);
1678          return;
1679       case Ain_Alu64R:
1680          mapRegs_AMD64RMI(m, i->Ain.Alu64R.src);
1681          mapReg(m, &i->Ain.Alu64R.dst);
1682          return;
1683       case Ain_Alu64M:
1684          mapRegs_AMD64RI(m, i->Ain.Alu64M.src);
1685          mapRegs_AMD64AMode(m, i->Ain.Alu64M.dst);
1686          return;
1687       case Ain_Sh64:
1688          mapReg(m, &i->Ain.Sh64.dst);
1689          return;
1690       case Ain_Test64:
1691          mapReg(m, &i->Ain.Test64.dst);
1692          return;
1693       case Ain_Unary64:
1694          mapReg(m, &i->Ain.Unary64.dst);
1695          return;
1696       case Ain_Lea64:
1697          mapRegs_AMD64AMode(m, i->Ain.Lea64.am);
1698          mapReg(m, &i->Ain.Lea64.dst);
1699          return;
1700       case Ain_Alu32R:
1701          mapRegs_AMD64RMI(m, i->Ain.Alu32R.src);
1702          mapReg(m, &i->Ain.Alu32R.dst);
1703          return;
1704       case Ain_MulL:
1705          mapRegs_AMD64RM(m, i->Ain.MulL.src);
1706          return;
1707       case Ain_Div:
1708          mapRegs_AMD64RM(m, i->Ain.Div.src);
1709          return;
1710       case Ain_Push:
1711          mapRegs_AMD64RMI(m, i->Ain.Push.src);
1712          return;
1713       case Ain_Call:
1714          return;
1715       case Ain_XDirect:
1716          mapRegs_AMD64AMode(m, i->Ain.XDirect.amRIP);
1717          return;
1718       case Ain_XIndir:
1719          mapReg(m, &i->Ain.XIndir.dstGA);
1720          mapRegs_AMD64AMode(m, i->Ain.XIndir.amRIP);
1721          return;
1722       case Ain_XAssisted:
1723          mapReg(m, &i->Ain.XAssisted.dstGA);
1724          mapRegs_AMD64AMode(m, i->Ain.XAssisted.amRIP);
1725          return;
1726       case Ain_CMov64:
1727          mapReg(m, &i->Ain.CMov64.src);
1728          mapReg(m, &i->Ain.CMov64.dst);
1729          return;
1730       case Ain_CLoad:
1731          mapRegs_AMD64AMode(m, i->Ain.CLoad.addr);
1732          mapReg(m, &i->Ain.CLoad.dst);
1733          return;
1734       case Ain_CStore:
1735          mapRegs_AMD64AMode(m, i->Ain.CStore.addr);
1736          mapReg(m, &i->Ain.CStore.src);
1737          return;
1738       case Ain_MovxLQ:
1739          mapReg(m, &i->Ain.MovxLQ.src);
1740          mapReg(m, &i->Ain.MovxLQ.dst);
1741          return;
1742       case Ain_LoadEX:
1743          mapRegs_AMD64AMode(m, i->Ain.LoadEX.src);
1744          mapReg(m, &i->Ain.LoadEX.dst);
1745          return;
1746       case Ain_Store:
1747          mapReg(m, &i->Ain.Store.src);
1748          mapRegs_AMD64AMode(m, i->Ain.Store.dst);
1749          return;
1750       case Ain_Set64:
1751          mapReg(m, &i->Ain.Set64.dst);
1752          return;
1753       case Ain_Bsfr64:
1754          mapReg(m, &i->Ain.Bsfr64.src);
1755          mapReg(m, &i->Ain.Bsfr64.dst);
1756          return;
1757       case Ain_MFence:
1758          return;
1759       case Ain_ACAS:
1760          mapRegs_AMD64AMode(m, i->Ain.ACAS.addr);
1761          return;
1762       case Ain_DACAS:
1763          mapRegs_AMD64AMode(m, i->Ain.DACAS.addr);
1764          return;
1765       case Ain_A87Free:
1766          return;
1767       case Ain_A87PushPop:
1768          mapRegs_AMD64AMode(m, i->Ain.A87PushPop.addr);
1769          return;
1770       case Ain_A87FpOp:
1771          return;
1772       case Ain_A87LdCW:
1773          mapRegs_AMD64AMode(m, i->Ain.A87LdCW.addr);
1774          return;
1775       case Ain_A87StSW:
1776          mapRegs_AMD64AMode(m, i->Ain.A87StSW.addr);
1777          return;
1778       case Ain_LdMXCSR:
1779          mapRegs_AMD64AMode(m, i->Ain.LdMXCSR.addr);
1780          return;
1781       case Ain_SseUComIS:
1782          mapReg(m, &i->Ain.SseUComIS.srcL);
1783          mapReg(m, &i->Ain.SseUComIS.srcR);
1784          mapReg(m, &i->Ain.SseUComIS.dst);
1785          return;
1786       case Ain_SseSI2SF:
1787          mapReg(m, &i->Ain.SseSI2SF.src);
1788          mapReg(m, &i->Ain.SseSI2SF.dst);
1789          return;
1790       case Ain_SseSF2SI:
1791          mapReg(m, &i->Ain.SseSF2SI.src);
1792          mapReg(m, &i->Ain.SseSF2SI.dst);
1793          return;
1794       case Ain_SseSDSS:
1795          mapReg(m, &i->Ain.SseSDSS.src);
1796          mapReg(m, &i->Ain.SseSDSS.dst);
1797          return;
1798       case Ain_SseLdSt:
1799          mapReg(m, &i->Ain.SseLdSt.reg);
1800          mapRegs_AMD64AMode(m, i->Ain.SseLdSt.addr);
1801          break;
1802       case Ain_SseLdzLO:
1803          mapReg(m, &i->Ain.SseLdzLO.reg);
1804          mapRegs_AMD64AMode(m, i->Ain.SseLdzLO.addr);
1805          break;
1806       case Ain_Sse32Fx4:
1807          mapReg(m, &i->Ain.Sse32Fx4.src);
1808          mapReg(m, &i->Ain.Sse32Fx4.dst);
1809          return;
1810       case Ain_Sse32FLo:
1811          mapReg(m, &i->Ain.Sse32FLo.src);
1812          mapReg(m, &i->Ain.Sse32FLo.dst);
1813          return;
1814       case Ain_Sse64Fx2:
1815          mapReg(m, &i->Ain.Sse64Fx2.src);
1816          mapReg(m, &i->Ain.Sse64Fx2.dst);
1817          return;
1818       case Ain_Sse64FLo:
1819          mapReg(m, &i->Ain.Sse64FLo.src);
1820          mapReg(m, &i->Ain.Sse64FLo.dst);
1821          return;
1822       case Ain_SseReRg:
1823          mapReg(m, &i->Ain.SseReRg.src);
1824          mapReg(m, &i->Ain.SseReRg.dst);
1825          return;
1826       case Ain_SseCMov:
1827          mapReg(m, &i->Ain.SseCMov.src);
1828          mapReg(m, &i->Ain.SseCMov.dst);
1829          return;
1830       case Ain_SseShuf:
1831          mapReg(m, &i->Ain.SseShuf.src);
1832          mapReg(m, &i->Ain.SseShuf.dst);
1833          return;
1834       //uu case Ain_AvxLdSt:
1835       //uu    mapReg(m, &i->Ain.AvxLdSt.reg);
1836       //uu    mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr);
1837       //uu    break;
1838       //uu case Ain_AvxReRg:
1839       //uu    mapReg(m, &i->Ain.AvxReRg.src);
1840       //uu    mapReg(m, &i->Ain.AvxReRg.dst);
1841       //uu    return;
1842       case Ain_EvCheck:
1843          /* We expect both amodes only to mention %rbp, so this is in
1844             fact pointless, since %rbp isn't allocatable, but anyway.. */
1845          mapRegs_AMD64AMode(m, i->Ain.EvCheck.amCounter);
1846          mapRegs_AMD64AMode(m, i->Ain.EvCheck.amFailAddr);
1847          return;
1848       case Ain_ProfInc:
1849          /* hardwires r11 -- nothing to modify. */
1850          return;
1851       default:
1852          ppAMD64Instr(i, mode64);
1853          vpanic("mapRegs_AMD64Instr");
1854    }
1855 }
1856 
1857 /* Figure out if i represents a reg-reg move, and if so assign the
1858    source and destination to *src and *dst.  If in doubt say No.  Used
1859    by the register allocator to do move coalescing.
1860 */
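/* Illustrative note (added, not in the original sources): a typical hit
   here is an Alu64R with op Aalu_MOV and an Armi_Reg source, i.e.
   "movq %vrS, %vrD", which is reported as a move from vrS to vrD so
   that the allocator can try to coalesce the two vregs and drop the
   move altogether.  Anything less obviously a plain copy -- a MOV from
   memory or from an immediate, say -- is conservatively reported as
   not-a-move. */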
1861 Bool isMove_AMD64Instr ( const AMD64Instr* i, HReg* src, HReg* dst )
1862 {
1863    switch (i->tag) {
1864       case Ain_Alu64R:
1865          /* Moves between integer regs */
1866          if (i->Ain.Alu64R.op != Aalu_MOV)
1867             return False;
1868          if (i->Ain.Alu64R.src->tag != Armi_Reg)
1869             return False;
1870          *src = i->Ain.Alu64R.src->Armi.Reg.reg;
1871          *dst = i->Ain.Alu64R.dst;
1872          return True;
1873       case Ain_SseReRg:
1874          /* Moves between SSE regs */
1875          if (i->Ain.SseReRg.op != Asse_MOV)
1876             return False;
1877          *src = i->Ain.SseReRg.src;
1878          *dst = i->Ain.SseReRg.dst;
1879          return True;
1880       //uu case Ain_AvxReRg:
1881       //uu    /* Moves between AVX regs */
1882       //uu    if (i->Ain.AvxReRg.op != Asse_MOV)
1883       //uu       return False;
1884       //uu    *src = i->Ain.AvxReRg.src;
1885       //uu    *dst = i->Ain.AvxReRg.dst;
1886       //uu    return True;
1887       default:
1888          return False;
1889    }
1890    /*NOTREACHED*/
1891 }
1892 
1893 
1894 /* Generate amd64 spill/reload instructions under the direction of the
1895    register allocator.  Note it's critical these don't write the
1896    condition codes. */
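/* Worked example (added comment): spilling an HRcInt64 real register
   such as %r12 at offsetB 48 yields the single instruction
   "movq %r12, 48(%rbp)", and the matching reload yields
   "movq 48(%rbp), %r12"; HRcVec128 registers go through
   AMD64Instr_SseLdSt as 16-byte loads/stores instead.  Neither form
   writes rflags, as required. */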
1897 
1898 void genSpill_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1899                       HReg rreg, Int offsetB, Bool mode64 )
1900 {
1901    AMD64AMode* am;
1902    vassert(offsetB >= 0);
1903    vassert(!hregIsVirtual(rreg));
1904    vassert(mode64 == True);
1905    *i1 = *i2 = NULL;
1906    am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
1907    switch (hregClass(rreg)) {
1908       case HRcInt64:
1909          *i1 = AMD64Instr_Alu64M ( Aalu_MOV, AMD64RI_Reg(rreg), am );
1910          return;
1911       case HRcVec128:
1912          *i1 = AMD64Instr_SseLdSt ( False/*store*/, 16, rreg, am );
1913          return;
1914       default:
1915          ppHRegClass(hregClass(rreg));
1916          vpanic("genSpill_AMD64: unimplemented regclass");
1917    }
1918 }
1919 
1920 void genReload_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
1921                        HReg rreg, Int offsetB, Bool mode64 )
1922 {
1923    AMD64AMode* am;
1924    vassert(offsetB >= 0);
1925    vassert(!hregIsVirtual(rreg));
1926    vassert(mode64 == True);
1927    *i1 = *i2 = NULL;
1928    am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
1929    switch (hregClass(rreg)) {
1930       case HRcInt64:
1931          *i1 = AMD64Instr_Alu64R ( Aalu_MOV, AMD64RMI_Mem(am), rreg );
1932          return;
1933       case HRcVec128:
1934          *i1 = AMD64Instr_SseLdSt ( True/*load*/, 16, rreg, am );
1935          return;
1936       default:
1937          ppHRegClass(hregClass(rreg));
1938          vpanic("genReload_AMD64: unimplemented regclass");
1939    }
1940 }
1941 
1942 
1943 /* --------- The amd64 assembler (bleh.) --------- */
1944 
1945 /* Produce the low three bits of an integer register number. */
1946 inline static UInt iregEnc210 ( HReg r )
1947 {
1948    UInt n;
1949    vassert(hregClass(r) == HRcInt64);
1950    vassert(!hregIsVirtual(r));
1951    n = hregEncoding(r);
1952    vassert(n <= 15);
1953    return n & 7;
1954 }
1955 
1956 /* Produce bit 3 of an integer register number. */
1957 inline static UInt iregEnc3 ( HReg r )
1958 {
1959    UInt n;
1960    vassert(hregClass(r) == HRcInt64);
1961    vassert(!hregIsVirtual(r));
1962    n = hregEncoding(r);
1963    vassert(n <= 15);
1964    return (n >> 3) & 1;
1965 }
1966 
1967 /* Produce a complete 4-bit integer register number. */
1968 inline static UInt iregEnc3210 ( HReg r )
1969 {
1970    UInt n;
1971    vassert(hregClass(r) == HRcInt64);
1972    vassert(!hregIsVirtual(r));
1973    n = hregEncoding(r);
1974    vassert(n <= 15);
1975    return n;
1976 }
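/* Example (added comment): %r12 has hardware encoding 12 (0b1100), so
   iregEnc3210 gives 12, iregEnc210 gives 4 (the low three bits, which
   land in ModRM/SIB fields) and iregEnc3 gives 1 (the top bit, which
   lands in the appropriate REX extension bit). */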
1977 
1978 /* Produce a complete 4-bit integer register number. */
1979 inline static UInt vregEnc3210 ( HReg r )
1980 {
1981    UInt n;
1982    vassert(hregClass(r) == HRcVec128);
1983    vassert(!hregIsVirtual(r));
1984    n = hregEncoding(r);
1985    vassert(n <= 15);
1986    return n;
1987 }
1988 
1989 inline static UChar mkModRegRM ( UInt mod, UInt reg, UInt regmem )
1990 {
1991    vassert(mod < 4);
1992    vassert((reg|regmem) < 8);
1993    return (UChar)( ((mod & 3) << 6) | ((reg & 7) << 3) | (regmem & 7) );
1994 }
1995 
1996 inline static UChar mkSIB ( UInt shift, UInt regindex, UInt regbase )
1997 {
1998    vassert(shift < 4);
1999    vassert((regindex|regbase) < 8);
2000    return (UChar)( ((shift & 3) << 6) | ((regindex & 7) << 3) | (regbase & 7) );
2001 }
2002 
2003 static UChar* emit32 ( UChar* p, UInt w32 )
2004 {
2005    *p++ = toUChar((w32)       & 0x000000FF);
2006    *p++ = toUChar((w32 >>  8) & 0x000000FF);
2007    *p++ = toUChar((w32 >> 16) & 0x000000FF);
2008    *p++ = toUChar((w32 >> 24) & 0x000000FF);
2009    return p;
2010 }
2011 
2012 static UChar* emit64 ( UChar* p, ULong w64 )
2013 {
2014    p = emit32(p, toUInt(w64         & 0xFFFFFFFF));
2015    p = emit32(p, toUInt((w64 >> 32) & 0xFFFFFFFF));
2016    return p;
2017 }
2018 
2019 /* Does a sign-extend of the lowest 8 bits give
2020    the original number? */
2021 static Bool fits8bits ( UInt w32 )
2022 {
2023    Int i32 = (Int)w32;
2024    return toBool(i32 == ((Int)(w32 << 24) >> 24));
2025 }
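/* Example (added comment): fits8bits(0x7F) and fits8bits(0xFFFFFF80)
   are True (127 and -128 survive the truncate-then-sign-extend round
   trip), whereas fits8bits(0x80) is False, since 128 would come back
   as -128. */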
2026 /* Can the lower 32 bits be signedly widened to produce the whole
2027    64-bit value?  In other words, are the top 33 bits either all 0 or
2028    all 1 ? */
2029 static Bool fitsIn32Bits ( ULong x )
2030 {
2031    Long y1;
2032    y1 = x << 32;
2033    y1 >>=/*s*/ 32;
2034    return toBool(x == y1);
2035 }
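/* Example (added comment): fitsIn32Bits(0x7FFFFFFFULL) and
   fitsIn32Bits(0xFFFFFFFF80000000ULL) are True, but
   fitsIn32Bits(0x80000000ULL) is False, because sign-extending its
   low 32 bits gives 0xFFFFFFFF80000000, not the original value. */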
2036 
2037 
2038 /* Forming mod-reg-rm bytes and scale-index-base bytes.
2039 
2040      greg,  0(ereg)    |  ereg is not any of: RSP RBP R12 R13
2041                        =  00 greg ereg
2042 
2043      greg,  d8(ereg)   |  ereg is neither of: RSP R12
2044                        =  01 greg ereg, d8
2045 
2046      greg,  d32(ereg)  |  ereg is neither of: RSP R12
2047                        =  10 greg ereg, d32
2048 
2049      greg,  d8(ereg)   |  ereg is either: RSP R12
2050                        =  01 greg 100, 0x24, d8
2051                        (lowest bit of rex distinguishes R12/RSP)
2052 
2053      greg,  d32(ereg)  |  ereg is either: RSP R12
2054                        =  10 greg 100, 0x24, d32
2055                        (lowest bit of rex distinguishes R12/RSP)
2056 
2057      -----------------------------------------------
2058 
2059      greg,  d8(base,index,scale)
2060                |  index != RSP
2061                =  01 greg 100, scale index base, d8
2062 
2063      greg,  d32(base,index,scale)
2064                |  index != RSP
2065                =  10 greg 100, scale index base, d32
2066 */
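/* Worked example (added comment): for greg %rax and the amode 8(%rdi),
   the "d8(ereg)" row applies, so the bytes emitted below are
   mod-reg-rm 0x47 (01 000 111) followed by the displacement byte 0x08;
   prefixed with REX.W 0x48 and opcode 0x8B that is
   "movq 8(%rdi), %rax" = 48 8B 47 08. */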
2067 static UChar* doAMode_M__wrk ( UChar* p, UInt gregEnc3210, AMD64AMode* am )
2068 {
2069    UInt gregEnc210 = gregEnc3210 & 7;
2070    if (am->tag == Aam_IR) {
2071       if (am->Aam.IR.imm == 0
2072           && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2073           && ! sameHReg(am->Aam.IR.reg, hregAMD64_RBP())
2074           && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2075           && ! sameHReg(am->Aam.IR.reg, hregAMD64_R13())
2076          ) {
2077          *p++ = mkModRegRM(0, gregEnc210, iregEnc210(am->Aam.IR.reg));
2078          return p;
2079       }
2080       if (fits8bits(am->Aam.IR.imm)
2081           && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2082           && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2083          ) {
2084          *p++ = mkModRegRM(1, gregEnc210, iregEnc210(am->Aam.IR.reg));
2085          *p++ = toUChar(am->Aam.IR.imm & 0xFF);
2086          return p;
2087       }
2088       if (! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2089           && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2090          ) {
2091          *p++ = mkModRegRM(2, gregEnc210, iregEnc210(am->Aam.IR.reg));
2092          p = emit32(p, am->Aam.IR.imm);
2093          return p;
2094       }
2095       if ((sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2096            || sameHReg(am->Aam.IR.reg, hregAMD64_R12()))
2097           && fits8bits(am->Aam.IR.imm)) {
2098          *p++ = mkModRegRM(1, gregEnc210, 4);
2099          *p++ = 0x24;
2100          *p++ = toUChar(am->Aam.IR.imm & 0xFF);
2101          return p;
2102       }
2103       if (/* sameHReg(am->Aam.IR.reg, hregAMD64_RSP()) ||
2104              -- RSP case disabled until a test case turns up */
2105           sameHReg(am->Aam.IR.reg, hregAMD64_R12())) {
2106          *p++ = mkModRegRM(2, gregEnc210, 4);
2107          *p++ = 0x24;
2108          p = emit32(p, am->Aam.IR.imm);
2109          return p;
2110       }
2111       ppAMD64AMode(am);
2112       vpanic("doAMode_M: can't emit amode IR");
2113       /*NOTREACHED*/
2114    }
2115    if (am->tag == Aam_IRRS) {
2116       if (fits8bits(am->Aam.IRRS.imm)
2117           && ! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
2118          *p++ = mkModRegRM(1, gregEnc210, 4);
2119          *p++ = mkSIB(am->Aam.IRRS.shift, iregEnc210(am->Aam.IRRS.index),
2120                                           iregEnc210(am->Aam.IRRS.base));
2121          *p++ = toUChar(am->Aam.IRRS.imm & 0xFF);
2122          return p;
2123       }
2124       if (! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
2125          *p++ = mkModRegRM(2, gregEnc210, 4);
2126          *p++ = mkSIB(am->Aam.IRRS.shift, iregEnc210(am->Aam.IRRS.index),
2127                                           iregEnc210(am->Aam.IRRS.base));
2128          p = emit32(p, am->Aam.IRRS.imm);
2129          return p;
2130       }
2131       ppAMD64AMode(am);
2132       vpanic("doAMode_M: can't emit amode IRRS");
2133       /*NOTREACHED*/
2134    }
2135    vpanic("doAMode_M: unknown amode");
2136    /*NOTREACHED*/
2137 }
2138 
2139 static UChar* doAMode_M ( UChar* p, HReg greg, AMD64AMode* am )
2140 {
2141    return doAMode_M__wrk(p, iregEnc3210(greg), am);
2142 }
2143 
2144 static UChar* doAMode_M_enc ( UChar* p, UInt gregEnc3210, AMD64AMode* am )
2145 {
2146    vassert(gregEnc3210 < 16);
2147    return doAMode_M__wrk(p, gregEnc3210, am);
2148 }
2149 
2150 
2151 /* Emit a mod-reg-rm byte when the rm bit denotes a reg. */
2152 inline
2153 static UChar* doAMode_R__wrk ( UChar* p, UInt gregEnc3210, UInt eregEnc3210 )
2154 {
2155    *p++ = mkModRegRM(3, gregEnc3210 & 7, eregEnc3210 & 7);
2156    return p;
2157 }
2158 
2159 static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
2160 {
2161    return doAMode_R__wrk(p, iregEnc3210(greg), iregEnc3210(ereg));
2162 }
2163 
2164 static UChar* doAMode_R_enc_reg ( UChar* p, UInt gregEnc3210, HReg ereg )
2165 {
2166    vassert(gregEnc3210 < 16);
2167    return doAMode_R__wrk(p, gregEnc3210, iregEnc3210(ereg));
2168 }
2169 
2170 static UChar* doAMode_R_reg_enc ( UChar* p, HReg greg, UInt eregEnc3210 )
2171 {
2172    vassert(eregEnc3210 < 16);
2173    return doAMode_R__wrk(p, iregEnc3210(greg), eregEnc3210);
2174 }
2175 
2176 static UChar* doAMode_R_enc_enc ( UChar* p, UInt gregEnc3210, UInt eregEnc3210 )
2177 {
2178    vassert( (gregEnc3210|eregEnc3210) < 16);
2179    return doAMode_R__wrk(p, gregEnc3210, eregEnc3210);
2180 }
2181 
2182 
2183 /* Clear the W bit on a REX byte, thereby changing the operand size
2184    back to whatever that instruction's default operand size is. */
2185 static inline UChar clearWBit ( UChar rex )
2186 {
2187    return rex & ~(1<<3);
2188 }
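/* Example (added comment): clearWBit(0x48) gives 0x40, turning a REX.W
   prefix (64-bit operand size) into a plain REX; this is how the
   32-bit cases below (e.g. Ain_Alu32R, the 4-byte Ain_Div) reuse the
   64-bit REX helpers. */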
2189 
2190 
2191 /* Make up a REX byte, with W=1 (size=64), for a (greg,amode) pair. */
2192 inline static UChar rexAMode_M__wrk ( UInt gregEnc3210, AMD64AMode* am )
2193 {
2194    if (am->tag == Aam_IR) {
2195       UChar W = 1;  /* we want 64-bit mode */
2196       UChar R = (gregEnc3210 >> 3) & 1;
2197       UChar X = 0; /* not relevant */
2198       UChar B = iregEnc3(am->Aam.IR.reg);
2199       return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
2200    }
2201    if (am->tag == Aam_IRRS) {
2202       UChar W = 1;  /* we want 64-bit mode */
2203       UChar R = (gregEnc3210 >> 3) & 1;
2204       UChar X = iregEnc3(am->Aam.IRRS.index);
2205       UChar B = iregEnc3(am->Aam.IRRS.base);
2206       return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
2207    }
2208    vassert(0);
2209    return 0; /*NOTREACHED*/
2210 }
2211 
2212 static UChar rexAMode_M ( HReg greg, AMD64AMode* am )
2213 {
2214    return rexAMode_M__wrk(iregEnc3210(greg), am);
2215 }
2216 
2217 static UChar rexAMode_M_enc ( UInt gregEnc3210, AMD64AMode* am )
2218 {
2219    vassert(gregEnc3210 < 16);
2220    return rexAMode_M__wrk(gregEnc3210, am);
2221 }
2222 
2223 
2224 /* Make up a REX byte, with W=1 (size=64), for a (greg,ereg) pair. */
2225 inline static UChar rexAMode_R__wrk ( UInt gregEnc3210, UInt eregEnc3210 )
2226 {
2227    UChar W = 1;  /* we want 64-bit mode */
2228    UChar R = (gregEnc3210 >> 3) & 1;
2229    UChar X = 0; /* not relevant */
2230    UChar B = (eregEnc3210 >> 3) & 1;
2231    return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
2232 }
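/* Example (added comment): rexAMode_R__wrk(1, 12) -- greg %rcx, ereg
   %r12 -- gives W=1 R=0 X=0 B=1 and hence the prefix byte 0x49. */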
2233 
2234 static UChar rexAMode_R ( HReg greg, HReg ereg )
2235 {
2236    return rexAMode_R__wrk(iregEnc3210(greg), iregEnc3210(ereg));
2237 }
2238 
2239 static UChar rexAMode_R_enc_reg ( UInt gregEnc3210, HReg ereg )
2240 {
2241    vassert(gregEnc3210 < 16);
2242    return rexAMode_R__wrk(gregEnc3210, iregEnc3210(ereg));
2243 }
2244 
2245 static UChar rexAMode_R_reg_enc ( HReg greg, UInt eregEnc3210 )
2246 {
2247    vassert(eregEnc3210 < 16);
2248    return rexAMode_R__wrk(iregEnc3210(greg), eregEnc3210);
2249 }
2250 
2251 static UChar rexAMode_R_enc_enc ( UInt gregEnc3210, UInt eregEnc3210 )
2252 {
2253    vassert((gregEnc3210|eregEnc3210) < 16);
2254    return rexAMode_R__wrk(gregEnc3210, eregEnc3210);
2255 }
2256 
2257 
2258 //uu /* May 2012: this VEX prefix stuff is currently unused, but has been
2259 //uu    verified correct (I reckon).  Certainly it has been known to
2260 //uu    produce correct VEX prefixes during testing. */
2261 //uu
2262 //uu /* Assemble a 2 or 3 byte VEX prefix from parts.  rexR, rexX, rexB and
2263 //uu    notVvvvv need to be not-ed before packing.  mmmmm, rexW, L and pp go
2264 //uu    in verbatim.  There's no range checking on the bits. */
2265 //uu static UInt packVexPrefix ( UInt rexR, UInt rexX, UInt rexB,
2266 //uu                             UInt mmmmm, UInt rexW, UInt notVvvv,
2267 //uu                             UInt L, UInt pp )
2268 //uu {
2269 //uu    UChar byte0 = 0;
2270 //uu    UChar byte1 = 0;
2271 //uu    UChar byte2 = 0;
2272 //uu    if (rexX == 0 && rexB == 0 && mmmmm == 1 && rexW == 0) {
2273 //uu       /* 2 byte encoding is possible. */
2274 //uu       byte0 = 0xC5;
2275 //uu       byte1 = ((rexR ^ 1) << 7) | ((notVvvv ^ 0xF) << 3)
2276 //uu               | (L << 2) | pp;
2277 //uu    } else {
2278 //uu       /* 3 byte encoding is needed. */
2279 //uu       byte0 = 0xC4;
2280 //uu       byte1 = ((rexR ^ 1) << 7) | ((rexX ^ 1) << 6)
2281 //uu               | ((rexB ^ 1) << 5) | mmmmm;
2282 //uu       byte2 = (rexW << 7) | ((notVvvv ^ 0xF) << 3) | (L << 2) | pp;
2283 //uu    }
2284 //uu    return (((UInt)byte2) << 16) | (((UInt)byte1) << 8) | ((UInt)byte0);
2285 //uu }
2286 //uu
2287 //uu /* Make up a VEX prefix for a (greg,amode) pair.  First byte in bits
2288 //uu    7:0 of result, second in 15:8, third (for a 3 byte prefix) in
2289 //uu    23:16.  Has m-mmmm set to indicate a prefix of 0F, pp set to
2290 //uu    indicate no SIMD prefix, W=0 (ignore), L=1 (size=256), and
2291 //uu    vvvv=1111 (unused 3rd reg). */
2292 //uu static UInt vexAMode_M ( HReg greg, AMD64AMode* am )
2293 //uu {
2294 //uu    UChar L       = 1; /* size = 256 */
2295 //uu    UChar pp      = 0; /* no SIMD prefix */
2296 //uu    UChar mmmmm   = 1; /* 0F */
2297 //uu    UChar notVvvv = 0; /* unused */
2298 //uu    UChar rexW    = 0;
2299 //uu    UChar rexR    = 0;
2300 //uu    UChar rexX    = 0;
2301 //uu    UChar rexB    = 0;
2302 //uu    /* Same logic as in rexAMode_M. */
2303 //uu    if (am->tag == Aam_IR) {
2304 //uu       rexR = iregEnc3(greg);
2305 //uu       rexX = 0; /* not relevant */
2306 //uu       rexB = iregEnc3(am->Aam.IR.reg);
2307 //uu    }
2308 //uu    else if (am->tag == Aam_IRRS) {
2309 //uu       rexR = iregEnc3(greg);
2310 //uu       rexX = iregEnc3(am->Aam.IRRS.index);
2311 //uu       rexB = iregEnc3(am->Aam.IRRS.base);
2312 //uu    } else {
2313 //uu       vassert(0);
2314 //uu    }
2315 //uu    return packVexPrefix( rexR, rexX, rexB, mmmmm, rexW, notVvvv, L, pp );
2316 //uu }
2317 //uu
2318 //uu static UChar* emitVexPrefix ( UChar* p, UInt vex )
2319 //uu {
2320 //uu    switch (vex & 0xFF) {
2321 //uu       case 0xC5:
2322 //uu          *p++ = 0xC5;
2323 //uu          *p++ = (vex >> 8) & 0xFF;
2324 //uu          vassert(0 == (vex >> 16));
2325 //uu          break;
2326 //uu       case 0xC4:
2327 //uu          *p++ = 0xC4;
2328 //uu          *p++ = (vex >> 8) & 0xFF;
2329 //uu          *p++ = (vex >> 16) & 0xFF;
2330 //uu          vassert(0 == (vex >> 24));
2331 //uu          break;
2332 //uu       default:
2333 //uu          vassert(0);
2334 //uu    }
2335 //uu    return p;
2336 //uu }
2337 
2338 
2339 /* Emit ffree %st(N) */
2340 static UChar* do_ffree_st ( UChar* p, Int n )
2341 {
2342    vassert(n >= 0 && n <= 7);
2343    *p++ = 0xDD;
2344    *p++ = toUChar(0xC0 + n);
2345    return p;
2346 }
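/* Example (added comment): do_ffree_st(p, 3) emits the two bytes
   DD C3, i.e. "ffree %st(3)". */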
2347 
2348 /* Emit an instruction into buf and return the number of bytes used.
2349    Note that buf is not the insn's final place, and therefore it is
2350    imperative to emit position-independent code.  If the emitted
2351    instruction was a profiler inc, set *is_profInc to True, else
2352    leave it unchanged. */
2353 
2354 Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
2355                       UChar* buf, Int nbuf, const AMD64Instr* i,
2356                       Bool mode64, VexEndness endness_host,
2357                       const void* disp_cp_chain_me_to_slowEP,
2358                       const void* disp_cp_chain_me_to_fastEP,
2359                       const void* disp_cp_xindir,
2360                       const void* disp_cp_xassisted )
2361 {
2362    UInt /*irno,*/ opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
2363    UInt   xtra;
2364    UInt   reg;
2365    UChar  rex;
2366    UChar* p = &buf[0];
2367    UChar* ptmp;
2368    Int    j;
2369    vassert(nbuf >= 32);
2370    vassert(mode64 == True);
2371 
2372    /* vex_printf("asm  "); ppAMD64Instr(i, mode64); vex_printf("\n"); */
2373 
2374    switch (i->tag) {
2375 
2376    case Ain_Imm64:
2377       if (i->Ain.Imm64.imm64 <= 0xFFFFFULL) {
2378          /* Use the short form (load into 32 bit reg, + default
2379             widening rule) for constants under 1 million.  We could
2380             use this form for the range 0 to 0x7FFFFFFF inclusive, but
2381             limit it to a smaller range for verifiability purposes. */
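         /* Illustrative encoding (added comment): Imm64 of 0x1234 into
            %r9 comes out as 41 B9 34 12 00 00, i.e.
            "movl $0x1234, %r9d", relying on the implicit zeroing of
            the upper 32 bits. */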
2382          if (1 & iregEnc3(i->Ain.Imm64.dst))
2383             *p++ = 0x41;
2384          *p++ = 0xB8 + iregEnc210(i->Ain.Imm64.dst);
2385          p = emit32(p, (UInt)i->Ain.Imm64.imm64);
2386       } else {
2387          *p++ = toUChar(0x48 + (1 & iregEnc3(i->Ain.Imm64.dst)));
2388          *p++ = toUChar(0xB8 + iregEnc210(i->Ain.Imm64.dst));
2389          p = emit64(p, i->Ain.Imm64.imm64);
2390       }
2391       goto done;
2392 
2393    case Ain_Alu64R:
2394       /* Deal specially with MOV */
2395       if (i->Ain.Alu64R.op == Aalu_MOV) {
2396          switch (i->Ain.Alu64R.src->tag) {
2397             case Armi_Imm:
2398                if (0 == (i->Ain.Alu64R.src->Armi.Imm.imm32 & ~0xFFFFF)) {
2399                   /* Actually we could use this form for constants in
2400                      the range 0 through 0x7FFFFFFF inclusive, but
2401                      limit it to a small range for verifiability
2402                      purposes. */
2403                   /* Generate "movl $imm32, 32-bit-register" and let
2404                      the default zero-extend rule cause the upper half
2405                      of the dst to be zeroed out too.  This saves 1
2406                      and sometimes 2 bytes compared to the more
2407                      obvious encoding in the 'else' branch. */
2408                   if (1 & iregEnc3(i->Ain.Alu64R.dst))
2409                      *p++ = 0x41;
2410                   *p++ = 0xB8 + iregEnc210(i->Ain.Alu64R.dst);
2411                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2412                } else {
2413                   *p++ = toUChar(0x48 + (1 & iregEnc3(i->Ain.Alu64R.dst)));
2414                   *p++ = 0xC7;
2415                   *p++ = toUChar(0xC0 + iregEnc210(i->Ain.Alu64R.dst));
2416                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2417                }
2418                goto done;
2419             case Armi_Reg:
2420                *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
2421                                   i->Ain.Alu64R.dst );
2422                *p++ = 0x89;
2423                p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
2424                                 i->Ain.Alu64R.dst);
2425                goto done;
2426             case Armi_Mem:
2427                *p++ = rexAMode_M(i->Ain.Alu64R.dst,
2428                                  i->Ain.Alu64R.src->Armi.Mem.am);
2429                *p++ = 0x8B;
2430                p = doAMode_M(p, i->Ain.Alu64R.dst,
2431                                 i->Ain.Alu64R.src->Armi.Mem.am);
2432                goto done;
2433             default:
2434                goto bad;
2435          }
2436       }
2437       /* MUL */
2438       if (i->Ain.Alu64R.op == Aalu_MUL) {
2439          switch (i->Ain.Alu64R.src->tag) {
2440             case Armi_Reg:
2441                *p++ = rexAMode_R( i->Ain.Alu64R.dst,
2442                                   i->Ain.Alu64R.src->Armi.Reg.reg);
2443                *p++ = 0x0F;
2444                *p++ = 0xAF;
2445                p = doAMode_R(p, i->Ain.Alu64R.dst,
2446                                 i->Ain.Alu64R.src->Armi.Reg.reg);
2447                goto done;
2448             case Armi_Mem:
2449                *p++ = rexAMode_M(i->Ain.Alu64R.dst,
2450                                  i->Ain.Alu64R.src->Armi.Mem.am);
2451                *p++ = 0x0F;
2452                *p++ = 0xAF;
2453                p = doAMode_M(p, i->Ain.Alu64R.dst,
2454                                 i->Ain.Alu64R.src->Armi.Mem.am);
2455                goto done;
2456             case Armi_Imm:
2457                if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2458                   *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2459                   *p++ = 0x6B;
2460                   p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2461                   *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
2462                } else {
2463                   *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2464                   *p++ = 0x69;
2465                   p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2466                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2467                }
2468                goto done;
2469             default:
2470                goto bad;
2471          }
2472       }
2473       /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
2474       opc = opc_rr = subopc_imm = opc_imma = 0;
2475       switch (i->Ain.Alu64R.op) {
2476          case Aalu_ADC: opc = 0x13; opc_rr = 0x11;
2477                         subopc_imm = 2; opc_imma = 0x15; break;
2478          case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
2479                         subopc_imm = 0; opc_imma = 0x05; break;
2480          case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
2481                         subopc_imm = 5; opc_imma = 0x2D; break;
2482          case Aalu_SBB: opc = 0x1B; opc_rr = 0x19;
2483                         subopc_imm = 3; opc_imma = 0x1D; break;
2484          case Aalu_AND: opc = 0x23; opc_rr = 0x21;
2485                         subopc_imm = 4; opc_imma = 0x25; break;
2486          case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
2487                         subopc_imm = 6; opc_imma = 0x35; break;
2488          case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
2489                         subopc_imm = 1; opc_imma = 0x0D; break;
2490          case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
2491                         subopc_imm = 7; opc_imma = 0x3D; break;
2492          default: goto bad;
2493       }
2494       switch (i->Ain.Alu64R.src->tag) {
2495          case Armi_Imm:
2496             if (sameHReg(i->Ain.Alu64R.dst, hregAMD64_RAX())
2497                 && !fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2498                goto bad; /* FIXME: awaiting test case */
2499                *p++ = toUChar(opc_imma);
2500                p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2501             } else
2502             if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2503                *p++ = rexAMode_R_enc_reg( 0, i->Ain.Alu64R.dst );
2504                *p++ = 0x83;
2505                p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu64R.dst);
2506                *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
2507             } else {
2508                *p++ = rexAMode_R_enc_reg( 0, i->Ain.Alu64R.dst);
2509                *p++ = 0x81;
2510                p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu64R.dst);
2511                p    = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2512             }
2513             goto done;
2514          case Armi_Reg:
2515             *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
2516                                i->Ain.Alu64R.dst);
2517             *p++ = toUChar(opc_rr);
2518             p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
2519                              i->Ain.Alu64R.dst);
2520             goto done;
2521          case Armi_Mem:
2522             *p++ = rexAMode_M( i->Ain.Alu64R.dst,
2523                                i->Ain.Alu64R.src->Armi.Mem.am);
2524             *p++ = toUChar(opc);
2525             p = doAMode_M(p, i->Ain.Alu64R.dst,
2526                              i->Ain.Alu64R.src->Armi.Mem.am);
2527             goto done;
2528          default:
2529             goto bad;
2530       }
2531       break;
2532 
2533    case Ain_Alu64M:
2534       /* Deal specially with MOV */
2535       if (i->Ain.Alu64M.op == Aalu_MOV) {
2536          switch (i->Ain.Alu64M.src->tag) {
2537             case Ari_Reg:
2538                *p++ = rexAMode_M(i->Ain.Alu64M.src->Ari.Reg.reg,
2539                                  i->Ain.Alu64M.dst);
2540                *p++ = 0x89;
2541                p = doAMode_M(p, i->Ain.Alu64M.src->Ari.Reg.reg,
2542                                 i->Ain.Alu64M.dst);
2543                goto done;
2544             case Ari_Imm:
2545                *p++ = rexAMode_M_enc(0, i->Ain.Alu64M.dst);
2546                *p++ = 0xC7;
2547                p = doAMode_M_enc(p, 0, i->Ain.Alu64M.dst);
2548                p = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
2549                goto done;
2550             default:
2551                goto bad;
2552          }
2553       }
2554       break;
2555 
2556    case Ain_Sh64:
2557       opc_cl = opc_imm = subopc = 0;
2558       switch (i->Ain.Sh64.op) {
2559          case Ash_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
2560          case Ash_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
2561          case Ash_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
2562          default: goto bad;
2563       }
2564       if (i->Ain.Sh64.src == 0) {
2565          *p++ = rexAMode_R_enc_reg(0, i->Ain.Sh64.dst);
2566          *p++ = toUChar(opc_cl);
2567          p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh64.dst);
2568          goto done;
2569       } else {
2570          *p++ = rexAMode_R_enc_reg(0, i->Ain.Sh64.dst);
2571          *p++ = toUChar(opc_imm);
2572          p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh64.dst);
2573          *p++ = (UChar)(i->Ain.Sh64.src);
2574          goto done;
2575       }
2576       break;
2577 
2578    case Ain_Test64:
2579       /* testq sign-extend($imm32), %reg */
2580       *p++ = rexAMode_R_enc_reg(0, i->Ain.Test64.dst);
2581       *p++ = 0xF7;
2582       p = doAMode_R_enc_reg(p, 0, i->Ain.Test64.dst);
2583       p = emit32(p, i->Ain.Test64.imm32);
2584       goto done;
2585 
2586    case Ain_Unary64:
2587       if (i->Ain.Unary64.op == Aun_NOT) {
2588          *p++ = rexAMode_R_enc_reg(0, i->Ain.Unary64.dst);
2589          *p++ = 0xF7;
2590          p = doAMode_R_enc_reg(p, 2, i->Ain.Unary64.dst);
2591          goto done;
2592       }
2593       if (i->Ain.Unary64.op == Aun_NEG) {
2594          *p++ = rexAMode_R_enc_reg(0, i->Ain.Unary64.dst);
2595          *p++ = 0xF7;
2596          p = doAMode_R_enc_reg(p, 3, i->Ain.Unary64.dst);
2597          goto done;
2598       }
2599       break;
2600 
2601    case Ain_Lea64:
2602       *p++ = rexAMode_M(i->Ain.Lea64.dst, i->Ain.Lea64.am);
2603       *p++ = 0x8D;
2604       p = doAMode_M(p, i->Ain.Lea64.dst, i->Ain.Lea64.am);
2605       goto done;
2606 
2607    case Ain_Alu32R:
2608       /* ADD/SUB/AND/OR/XOR/CMP */
2609       opc = opc_rr = subopc_imm = opc_imma = 0;
2610       switch (i->Ain.Alu32R.op) {
2611          case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
2612                         subopc_imm = 0; opc_imma = 0x05; break;
2613          case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
2614                         subopc_imm = 5; opc_imma = 0x2D; break;
2615          case Aalu_AND: opc = 0x23; opc_rr = 0x21;
2616                         subopc_imm = 4; opc_imma = 0x25; break;
2617          case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
2618                         subopc_imm = 6; opc_imma = 0x35; break;
2619          case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
2620                         subopc_imm = 1; opc_imma = 0x0D; break;
2621          case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
2622                         subopc_imm = 7; opc_imma = 0x3D; break;
2623          default: goto bad;
2624       }
2625       switch (i->Ain.Alu32R.src->tag) {
2626          case Armi_Imm:
2627             if (sameHReg(i->Ain.Alu32R.dst, hregAMD64_RAX())
2628                 && !fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
2629                goto bad; /* FIXME: awaiting test case */
2630                *p++ = toUChar(opc_imma);
2631                p = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
2632             } else
2633             if (fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
2634                rex  = clearWBit( rexAMode_R_enc_reg( 0, i->Ain.Alu32R.dst ) );
2635                if (rex != 0x40) *p++ = rex;
2636                *p++ = 0x83;
2637                p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu32R.dst);
2638                *p++ = toUChar(0xFF & i->Ain.Alu32R.src->Armi.Imm.imm32);
2639             } else {
2640                rex  = clearWBit( rexAMode_R_enc_reg( 0, i->Ain.Alu32R.dst) );
2641                if (rex != 0x40) *p++ = rex;
2642                *p++ = 0x81;
2643                p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu32R.dst);
2644                p    = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
2645             }
2646             goto done;
2647          case Armi_Reg:
2648             rex  = clearWBit(
2649                    rexAMode_R( i->Ain.Alu32R.src->Armi.Reg.reg,
2650                                i->Ain.Alu32R.dst) );
2651             if (rex != 0x40) *p++ = rex;
2652             *p++ = toUChar(opc_rr);
2653             p = doAMode_R(p, i->Ain.Alu32R.src->Armi.Reg.reg,
2654                              i->Ain.Alu32R.dst);
2655             goto done;
2656          case Armi_Mem:
2657             rex  = clearWBit(
2658                    rexAMode_M( i->Ain.Alu32R.dst,
2659                                i->Ain.Alu32R.src->Armi.Mem.am) );
2660             if (rex != 0x40) *p++ = rex;
2661             *p++ = toUChar(opc);
2662             p = doAMode_M(p, i->Ain.Alu32R.dst,
2663                              i->Ain.Alu32R.src->Armi.Mem.am);
2664             goto done;
2665          default:
2666             goto bad;
2667       }
2668       break;
2669 
2670    case Ain_MulL:
2671       subopc = i->Ain.MulL.syned ? 5 : 4;
2672       switch (i->Ain.MulL.src->tag)  {
2673          case Arm_Mem:
2674             *p++ = rexAMode_M_enc(0, i->Ain.MulL.src->Arm.Mem.am);
2675             *p++ = 0xF7;
2676             p = doAMode_M_enc(p, subopc, i->Ain.MulL.src->Arm.Mem.am);
2677             goto done;
2678          case Arm_Reg:
2679             *p++ = rexAMode_R_enc_reg(0, i->Ain.MulL.src->Arm.Reg.reg);
2680             *p++ = 0xF7;
2681             p = doAMode_R_enc_reg(p, subopc, i->Ain.MulL.src->Arm.Reg.reg);
2682             goto done;
2683          default:
2684             goto bad;
2685       }
2686       break;
2687 
2688    case Ain_Div:
2689       subopc = i->Ain.Div.syned ? 7 : 6;
2690       if (i->Ain.Div.sz == 4) {
2691          switch (i->Ain.Div.src->tag)  {
2692             case Arm_Mem:
2693                goto bad;
2694                /*FIXME*/
2695                *p++ = 0xF7;
2696                p = doAMode_M_enc(p, subopc, i->Ain.Div.src->Arm.Mem.am);
2697                goto done;
2698             case Arm_Reg:
2699                *p++ = clearWBit(
2700                       rexAMode_R_enc_reg(0, i->Ain.Div.src->Arm.Reg.reg));
2701                *p++ = 0xF7;
2702                p = doAMode_R_enc_reg(p, subopc, i->Ain.Div.src->Arm.Reg.reg);
2703                goto done;
2704             default:
2705                goto bad;
2706          }
2707       }
2708       if (i->Ain.Div.sz == 8) {
2709          switch (i->Ain.Div.src->tag)  {
2710             case Arm_Mem:
2711                *p++ = rexAMode_M_enc(0, i->Ain.Div.src->Arm.Mem.am);
2712                *p++ = 0xF7;
2713                p = doAMode_M_enc(p, subopc, i->Ain.Div.src->Arm.Mem.am);
2714                goto done;
2715             case Arm_Reg:
2716                *p++ = rexAMode_R_enc_reg(0, i->Ain.Div.src->Arm.Reg.reg);
2717                *p++ = 0xF7;
2718                p = doAMode_R_enc_reg(p, subopc, i->Ain.Div.src->Arm.Reg.reg);
2719                goto done;
2720             default:
2721                goto bad;
2722          }
2723       }
2724       break;
2725 
2726    case Ain_Push:
2727       switch (i->Ain.Push.src->tag) {
2728          case Armi_Mem:
2729             *p++ = clearWBit(
2730                    rexAMode_M_enc(0, i->Ain.Push.src->Armi.Mem.am));
2731             *p++ = 0xFF;
2732             p = doAMode_M_enc(p, 6, i->Ain.Push.src->Armi.Mem.am);
2733             goto done;
2734          case Armi_Imm:
2735             *p++ = 0x68;
2736             p = emit32(p, i->Ain.Push.src->Armi.Imm.imm32);
2737             goto done;
2738          case Armi_Reg:
2739             *p++ = toUChar(0x40 + (1 & iregEnc3(i->Ain.Push.src->Armi.Reg.reg)));
2740             *p++ = toUChar(0x50 + iregEnc210(i->Ain.Push.src->Armi.Reg.reg));
2741             goto done;
2742         default:
2743             goto bad;
2744       }
2745 
2746    case Ain_Call: {
2747       /* As per detailed comment for Ain_Call in getRegUsage_AMD64Instr
2748          above, %r11 is used as an address temporary. */
2749       /* If we don't need to do any fixup actions in the case that the
2750          call doesn't happen, just do the simple thing and emit
2751          straight-line code.  This is usually the case. */
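      /* Illustrative byte sequence (added comment): for a conditional
         call whose target needs all 64 bits, this path emits
         "j<!cond> +13" (2 bytes), "movabsq $target, %r11"
         (49 BB imm64, 10 bytes) and "call *%r11" (41 FF D3, 3 bytes),
         so the taken branch skips exactly the 13 bytes of the last
         two insns. */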
2752       if (i->Ain.Call.cond == Acc_ALWAYS/*call always happens*/
2753           || i->Ain.Call.rloc.pri == RLPri_None/*no fixup action*/) {
2754          /* jump over the following two insns if the condition does
2755             not hold */
2756          Bool shortImm = fitsIn32Bits(i->Ain.Call.target);
2757          if (i->Ain.Call.cond != Acc_ALWAYS) {
2758             *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
2759             *p++ = shortImm ? 10 : 13;
2760             /* 10 or 13 bytes in the next two insns */
2761          }
2762          if (shortImm) {
2763             /* 7 bytes: movl sign-extend(imm32), %r11 */
2764             *p++ = 0x49;
2765             *p++ = 0xC7;
2766             *p++ = 0xC3;
2767             p = emit32(p, (UInt)i->Ain.Call.target);
2768          } else {
2769             /* 10 bytes: movabsq $target, %r11 */
2770             *p++ = 0x49;
2771             *p++ = 0xBB;
2772             p = emit64(p, i->Ain.Call.target);
2773          }
2774          /* 3 bytes: call *%r11 */
2775          *p++ = 0x41;
2776          *p++ = 0xFF;
2777          *p++ = 0xD3;
2778       } else {
2779          Int delta;
2780          /* Complex case.  We have to generate an if-then-else diamond. */
2781          // before:
2782          //   j{!cond} else:
2783          //   movabsq $target, %r11
2784          //   call* %r11
2785          // preElse:
2786          //   jmp after:
2787          // else:
2788          //   movabsq $0x5555555555555555, %rax  // possibly
2789          //   movq %rax, %rdx                    // possibly
2790          // after:
2791 
2792          // before:
2793          UChar* pBefore = p;
2794 
2795          //   j{!cond} else:
2796          *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
2797          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2798 
2799          //   movabsq $target, %r11
2800          *p++ = 0x49;
2801          *p++ = 0xBB;
2802          p = emit64(p, i->Ain.Call.target);
2803 
2804          //   call* %r11
2805          *p++ = 0x41;
2806          *p++ = 0xFF;
2807          *p++ = 0xD3;
2808 
2809          // preElse:
2810          UChar* pPreElse = p;
2811 
2812          //   jmp after:
2813          *p++ = 0xEB;
2814          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2815 
2816          // else:
2817          UChar* pElse = p;
2818 
2819          /* Do the 'else' actions */
2820          switch (i->Ain.Call.rloc.pri) {
2821             case RLPri_Int:
2822                // movabsq $0x5555555555555555, %rax
2823                *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
2824                break;
2825             case RLPri_2Int:
2826                vassert(0); //ATC
2827                // movabsq $0x5555555555555555, %rax
2828                *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
2829                // movq %rax, %rdx
2830                *p++ = 0x48; *p++ = 0x89; *p++ = 0xC2;
                    break;
2831             case RLPri_None: case RLPri_INVALID: default:
2832                vassert(0);
2833          }
2834 
2835          // after:
2836          UChar* pAfter = p;
2837 
2838          // Fix up the branch offsets.  The +2s in the offset
2839          // calculations are there because x86 requires conditional
2840          // branches to have their offset stated relative to the
2841          // instruction immediately following the branch insn.  And in
2842          // both cases the branch insns are 2 bytes long.
2843 
2844          // First, the "j{!cond} else:" at pBefore.
2845          delta = (Int)(Long)(pElse - (pBefore + 2));
2846          vassert(delta >= 0 && delta < 100/*arbitrary*/);
2847          *(pBefore+1) = (UChar)delta;
2848 
2849          // And secondly, the "jmp after:" at pPreElse.
2850          delta = (Int)(Long)(pAfter - (pPreElse + 2));
2851          vassert(delta >= 0 && delta < 100/*arbitrary*/);
2852          *(pPreElse+1) = (UChar)delta;
2853       }
2854       goto done;
2855    }
2856 
2857    case Ain_XDirect: {
2858       /* NB: what goes on here has to be very closely coordinated with the
2859          chainXDirect_AMD64 and unchainXDirect_AMD64 below. */
2860       /* We're generating chain-me requests here, so we need to be
2861          sure this is actually allowed -- no-redir translations can't
2862          use chain-me's.  Hence: */
2863       vassert(disp_cp_chain_me_to_slowEP != NULL);
2864       vassert(disp_cp_chain_me_to_fastEP != NULL);
2865 
2866       HReg r11 = hregAMD64_R11();
2867 
2868       /* Use ptmp for backpatching conditional jumps. */
2869       ptmp = NULL;
2870 
2871       /* First off, if this is conditional, create a conditional
2872          jump over the rest of it. */
2873       if (i->Ain.XDirect.cond != Acc_ALWAYS) {
2874          /* jmp fwds if !condition */
2875          *p++ = toUChar(0x70 + (0xF & (i->Ain.XDirect.cond ^ 1)));
2876          ptmp = p; /* fill in this bit later */
2877          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2878       }
2879 
2880       /* Update the guest RIP. */
2881       if (fitsIn32Bits(i->Ain.XDirect.dstGA)) {
2882          /* use a shorter encoding */
2883          /* movl sign-extend(dstGA), %r11 */
2884          *p++ = 0x49;
2885          *p++ = 0xC7;
2886          *p++ = 0xC3;
2887          p = emit32(p, (UInt)i->Ain.XDirect.dstGA);
2888       } else {
2889          /* movabsq $dstGA, %r11 */
2890          *p++ = 0x49;
2891          *p++ = 0xBB;
2892          p = emit64(p, i->Ain.XDirect.dstGA);
2893       }
2894 
2895       /* movq %r11, amRIP */
2896       *p++ = rexAMode_M(r11, i->Ain.XDirect.amRIP);
2897       *p++ = 0x89;
2898       p = doAMode_M(p, r11, i->Ain.XDirect.amRIP);
2899 
2900       /* --- FIRST PATCHABLE BYTE follows --- */
2901       /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
2902          to) backs up the return address, so as to find the address of
2903          the first patchable byte.  So: don't change the length of the
2904          two instructions below. */
2905       /* movabsq $disp_cp_chain_me_to_{slow,fast}EP,%r11; */
2906       *p++ = 0x49;
2907       *p++ = 0xBB;
2908       const void* disp_cp_chain_me
2909                = i->Ain.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP
2910                                          : disp_cp_chain_me_to_slowEP;
2911       p = emit64(p, (Addr)disp_cp_chain_me);
2912       /* call *%r11 */
2913       *p++ = 0x41;
2914       *p++ = 0xFF;
2915       *p++ = 0xD3;
2916       /* --- END of PATCHABLE BYTES --- */
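           /* So the patchable tail is 13 bytes in total, roughly:
                 49 BB <8-byte imm64>    movabsq $disp_cp_chain_me, %r11
                 41 FF D3                call *%r11
              chainXDirect_AMD64 below expects exactly this layout. */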
2917 
2918       /* Fix up the conditional jump, if there was one. */
2919       if (i->Ain.XDirect.cond != Acc_ALWAYS) {
2920          Int delta = p - ptmp;
2921          vassert(delta > 0 && delta < 40);
2922          *ptmp = toUChar(delta-1);
2923       }
2924       goto done;
2925    }
2926 
2927    case Ain_XIndir: {
2928       /* We're generating transfers that could lead indirectly to a
2929          chain-me, so we need to be sure this is actually allowed --
2930          no-redir translations are not allowed to reach normal
2931          translations without going through the scheduler.  That means
2932          no XDirects or XIndirs out from no-redir translations.
2933          Hence: */
2934       vassert(disp_cp_xindir != NULL);
2935 
2936       /* Use ptmp for backpatching conditional jumps. */
2937       ptmp = NULL;
2938 
2939       /* First off, if this is conditional, create a conditional
2940          jump over the rest of it. */
2941       if (i->Ain.XIndir.cond != Acc_ALWAYS) {
2942          /* jmp fwds if !condition */
2943          *p++ = toUChar(0x70 + (0xF & (i->Ain.XIndir.cond ^ 1)));
2944          ptmp = p; /* fill in this bit later */
2945          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2946       }
2947 
2948       /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
2949       *p++ = rexAMode_M(i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
2950       *p++ = 0x89;
2951       p = doAMode_M(p, i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
2952 
2953       /* get $disp_cp_xindir into %r11 */
2954       if (fitsIn32Bits((Addr)disp_cp_xindir)) {
2955          /* use a shorter encoding */
2956          /* movl sign-extend(disp_cp_xindir), %r11 */
2957          *p++ = 0x49;
2958          *p++ = 0xC7;
2959          *p++ = 0xC3;
2960          p = emit32(p, (UInt)(Addr)disp_cp_xindir);
2961       } else {
2962          /* movabsq $disp_cp_xindir, %r11 */
2963          *p++ = 0x49;
2964          *p++ = 0xBB;
2965          p = emit64(p, (Addr)disp_cp_xindir);
2966       }
2967 
2968       /* jmp *%r11 */
2969       *p++ = 0x41;
2970       *p++ = 0xFF;
2971       *p++ = 0xE3;
2972 
2973       /* Fix up the conditional jump, if there was one. */
2974       if (i->Ain.XIndir.cond != Acc_ALWAYS) {
2975          Int delta = p - ptmp;
2976          vassert(delta > 0 && delta < 40);
2977          *ptmp = toUChar(delta-1);
2978       }
2979       goto done;
2980    }
2981 
2982    case Ain_XAssisted: {
2983       /* Use ptmp for backpatching conditional jumps. */
2984       ptmp = NULL;
2985 
2986       /* First off, if this is conditional, create a conditional
2987          jump over the rest of it. */
2988       if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
2989          /* jmp fwds if !condition */
2990          *p++ = toUChar(0x70 + (0xF & (i->Ain.XAssisted.cond ^ 1)));
2991          ptmp = p; /* fill in this bit later */
2992          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
2993       }
2994 
2995       /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
2996       *p++ = rexAMode_M(i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
2997       *p++ = 0x89;
2998       p = doAMode_M(p, i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
2999       /* movl $magic_number, %ebp.  Since these numbers are all small positive
3000          integers, we can get away with "movl $N, %ebp" rather than
3001          the longer "movq $N, %rbp". */
3002       UInt trcval = 0;
3003       switch (i->Ain.XAssisted.jk) {
3004          case Ijk_ClientReq:   trcval = VEX_TRC_JMP_CLIENTREQ;   break;
3005          case Ijk_Sys_syscall: trcval = VEX_TRC_JMP_SYS_SYSCALL; break;
3006          case Ijk_Sys_int32:   trcval = VEX_TRC_JMP_SYS_INT32;   break;
3007          case Ijk_Yield:       trcval = VEX_TRC_JMP_YIELD;       break;
3008          case Ijk_EmWarn:      trcval = VEX_TRC_JMP_EMWARN;      break;
3009          case Ijk_MapFail:     trcval = VEX_TRC_JMP_MAPFAIL;     break;
3010          case Ijk_NoDecode:    trcval = VEX_TRC_JMP_NODECODE;    break;
3011          case Ijk_InvalICache: trcval = VEX_TRC_JMP_INVALICACHE; break;
3012          case Ijk_NoRedir:     trcval = VEX_TRC_JMP_NOREDIR;     break;
3013          case Ijk_SigTRAP:     trcval = VEX_TRC_JMP_SIGTRAP;     break;
3014          case Ijk_SigSEGV:     trcval = VEX_TRC_JMP_SIGSEGV;     break;
3015          case Ijk_Boring:      trcval = VEX_TRC_JMP_BORING;      break;
3016          /* We don't expect to see the following being assisted. */
3017          case Ijk_Ret:
3018          case Ijk_Call:
3019          /* fallthrough */
3020          default:
3021             ppIRJumpKind(i->Ain.XAssisted.jk);
3022             vpanic("emit_AMD64Instr.Ain_XAssisted: unexpected jump kind");
3023       }
3024       vassert(trcval != 0);
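           /* 0xBD is "movl $imm32, %ebp" (opcode 0xB8 + 5, %ebp being
              register 5). */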
3025       *p++ = 0xBD;
3026       p = emit32(p, trcval);
3027       /* movabsq $disp_cp_xassisted, %r11 */
3028       *p++ = 0x49;
3029       *p++ = 0xBB;
3030       p = emit64(p, (Addr)disp_cp_xassisted);
3031       /* jmp *%r11 */
3032       *p++ = 0x41;
3033       *p++ = 0xFF;
3034       *p++ = 0xE3;
3035 
3036       /* Fix up the conditional jump, if there was one. */
3037       if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
3038          Int delta = p - ptmp;
3039          vassert(delta > 0 && delta < 40);
3040          *ptmp = toUChar(delta-1);
3041       }
3042       goto done;
3043    }
3044 
3045    case Ain_CMov64:
3046       vassert(i->Ain.CMov64.cond != Acc_ALWAYS);
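           /* cmovcc is 0F 40+cc /r with the destination in the reg field;
              for example "cmovnzq %rcx, %rax" would be, roughly,
              48 0F 45 C1. */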
3047       *p++ = rexAMode_R(i->Ain.CMov64.dst, i->Ain.CMov64.src);
3048       *p++ = 0x0F;
3049       *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
3050       p = doAMode_R(p, i->Ain.CMov64.dst, i->Ain.CMov64.src);
3051       goto done;
3052 
3053    case Ain_CLoad: {
3054       vassert(i->Ain.CLoad.cond != Acc_ALWAYS);
3055 
3056       /* Only 32- or 64-bit variants are allowed. */
3057       vassert(i->Ain.CLoad.szB == 4 || i->Ain.CLoad.szB == 8);
3058 
3059       /* Use ptmp for backpatching conditional jumps. */
3060       ptmp = NULL;
3061 
3062       /* jmp fwds if !condition */
3063       *p++ = toUChar(0x70 + (0xF & (i->Ain.CLoad.cond ^ 1)));
3064       ptmp = p; /* fill in this bit later */
3065       *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3066 
3067       /* Now the load.  Either a normal 64 bit load or a normal 32 bit
3068          load, which, by the default zero-extension rule, zeroes out
3069          the upper half of the destination, as required. */
3070       rex = rexAMode_M(i->Ain.CLoad.dst, i->Ain.CLoad.addr);
3071       *p++ = i->Ain.CLoad.szB == 4 ? clearWBit(rex) : rex;
3072       *p++ = 0x8B;
3073       p = doAMode_M(p, i->Ain.CLoad.dst, i->Ain.CLoad.addr);
3074 
3075       /* Fix up the conditional branch */
3076       Int delta = p - ptmp;
3077       vassert(delta > 0 && delta < 40);
3078       *ptmp = toUChar(delta-1);
3079       goto done;
3080    }
3081 
3082    case Ain_CStore: {
3083       /* AFAICS this is identical to Ain_CLoad except that the opcode
3084          is 0x89 instead of 0x8B. */
3085       vassert(i->Ain.CStore.cond != Acc_ALWAYS);
3086 
3087       /* Only 32- or 64-bit variants are allowed. */
3088       vassert(i->Ain.CStore.szB == 4 || i->Ain.CStore.szB == 8);
3089 
3090       /* Use ptmp for backpatching conditional jumps. */
3091       ptmp = NULL;
3092 
3093       /* jmp fwds if !condition */
3094       *p++ = toUChar(0x70 + (0xF & (i->Ain.CStore.cond ^ 1)));
3095       ptmp = p; /* fill in this bit later */
3096       *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3097 
3098       /* Now the store. */
3099       rex = rexAMode_M(i->Ain.CStore.src, i->Ain.CStore.addr);
3100       *p++ = i->Ain.CStore.szB == 4 ? clearWBit(rex) : rex;
3101       *p++ = 0x89;
3102       p = doAMode_M(p, i->Ain.CStore.src, i->Ain.CStore.addr);
3103 
3104       /* Fix up the conditional branch */
3105       Int delta = p - ptmp;
3106       vassert(delta > 0 && delta < 40);
3107       *ptmp = toUChar(delta-1);
3108       goto done;
3109    }
3110 
3111    case Ain_MovxLQ:
3112       /* No, _don't_ ask me why the sense of the args has to be
3113          different in the S vs Z case.  I don't know. */
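           /* (Most likely because 0x63 (movslq) is a load-form opcode,
              with the destination in the ModRM reg field, whereas 0x89
              (movl) is a store-form opcode, with the source there.) */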
3114       if (i->Ain.MovxLQ.syned) {
3115          /* Need REX.W = 1 here, but rexAMode_R does that for us. */
3116          *p++ = rexAMode_R(i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
3117          *p++ = 0x63;
3118          p = doAMode_R(p, i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
3119       } else {
3120          /* Produce a 32-bit reg-reg move, since the implicit
3121             zero-extend does what we want. */
3122          *p++ = clearWBit (
3123                    rexAMode_R(i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst));
3124          *p++ = 0x89;
3125          p = doAMode_R(p, i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst);
3126       }
3127       goto done;
3128 
3129    case Ain_LoadEX:
3130       if (i->Ain.LoadEX.szSmall == 1 && !i->Ain.LoadEX.syned) {
3131          /* movzbq */
3132          *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3133          *p++ = 0x0F;
3134          *p++ = 0xB6;
3135          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3136          goto done;
3137       }
3138       if (i->Ain.LoadEX.szSmall == 2 && !i->Ain.LoadEX.syned) {
3139          /* movzwq */
3140          *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3141          *p++ = 0x0F;
3142          *p++ = 0xB7;
3143          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3144          goto done;
3145       }
3146       if (i->Ain.LoadEX.szSmall == 4 && !i->Ain.LoadEX.syned) {
3147          /* movzlq */
3148          /* This isn't really an existing AMD64 instruction per se.
3149             Rather, we have to do a 32-bit load.  Because a 32-bit
3150             write implicitly clears the upper 32 bits of the target
3151             register, we get what we want. */
3152          *p++ = clearWBit(
3153                 rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src));
3154          *p++ = 0x8B;
3155          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3156          goto done;
3157       }
3158       break;
3159 
3160    case Ain_Set64:
3161       /* Make the destination register be 1 or 0, depending on whether
3162          the relevant condition holds.  Complication: the top 56 bits
3163          of the destination should be forced to zero, but doing 'xorq
3164          %r,%r' kills the flag(s) we are about to read.  Sigh.  So
3165          start off by moving $0 into the dest. */
3166       reg = iregEnc3210(i->Ain.Set64.dst);
3167       vassert(reg < 16);
3168 
3169       /* movq $0, %dst */
3170       *p++ = toUChar(reg >= 8 ? 0x49 : 0x48);
3171       *p++ = 0xC7;
3172       *p++ = toUChar(0xC0 + (reg & 7));
3173       p = emit32(p, 0);
3174 
3175       /* setb lo8(%dst) */
3176       /* note, 8-bit register rex trickiness.  Be careful here. */
3177       *p++ = toUChar(reg >= 8 ? 0x41 : 0x40);
3178       *p++ = 0x0F;
3179       *p++ = toUChar(0x90 + (0x0F & i->Ain.Set64.cond));
3180       *p++ = toUChar(0xC0 + (reg & 7));
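           /* For example, with dst = %r8 and condition Acc_Z this gives,
              roughly, 41 0F 94 C0, i.e. "setz %r8b". */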
3181       goto done;
3182 
3183    case Ain_Bsfr64:
3184       *p++ = rexAMode_R(i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
3185       *p++ = 0x0F;
3186       if (i->Ain.Bsfr64.isFwds) {
3187          *p++ = 0xBC;
3188       } else {
3189          *p++ = 0xBD;
3190       }
3191       p = doAMode_R(p, i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
3192       goto done;
3193 
3194    case Ain_MFence:
3195       /* mfence */
3196       *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
3197       goto done;
3198 
3199    case Ain_ACAS:
3200       /* lock */
3201       *p++ = 0xF0;
3202       if (i->Ain.ACAS.sz == 2) *p++ = 0x66;
3203       /* cmpxchg{b,w,l,q} %rbx,mem.  Expected-value in %rax, new value
3204          in %rbx.  The new-value register is hardwired to be %rbx
3205          since dealing with byte integer registers is too much hassle,
3206          so we force the register operand to %rbx (could equally be
3207          %rcx or %rdx). */
3208       rex = rexAMode_M( hregAMD64_RBX(), i->Ain.ACAS.addr );
3209       if (i->Ain.ACAS.sz != 8)
3210          rex = clearWBit(rex);
3211 
3212       *p++ = rex; /* this can emit 0x40, which is pointless. oh well. */
3213       *p++ = 0x0F;
3214       if (i->Ain.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
3215       p = doAMode_M(p, hregAMD64_RBX(), i->Ain.ACAS.addr);
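           /* For example, a 64-bit CAS on (%rdi) comes out, roughly, as
              F0 48 0F B1 1F, i.e. "lock cmpxchgq %rbx, (%rdi)". */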
3216       goto done;
3217 
3218    case Ain_DACAS:
3219       /* lock */
3220       *p++ = 0xF0;
3221       /* cmpxchg{8,16}b m{64,128}.  Expected-value in %rdx:%rax, new
3222          value in %rcx:%rbx.  All 4 regs are hardwired in the ISA, so
3223          aren't encoded in the insn. */
3224       rex = rexAMode_M_enc(1, i->Ain.DACAS.addr );
3225       if (i->Ain.DACAS.sz != 8)
3226          rex = clearWBit(rex);
3227       *p++ = rex;
3228       *p++ = 0x0F;
3229       *p++ = 0xC7;
3230       p = doAMode_M_enc(p, 1, i->Ain.DACAS.addr);
3231       goto done;
3232 
3233    case Ain_A87Free:
3234       vassert(i->Ain.A87Free.nregs > 0 && i->Ain.A87Free.nregs <= 7);
3235       for (j = 0; j < i->Ain.A87Free.nregs; j++) {
3236          p = do_ffree_st(p, 7-j);
3237       }
3238       goto done;
3239 
3240    case Ain_A87PushPop:
3241       vassert(i->Ain.A87PushPop.szB == 8 || i->Ain.A87PushPop.szB == 4);
3242       if (i->Ain.A87PushPop.isPush) {
3243          /* Load from memory into %st(0): flds/fldl amode */
3244          *p++ = clearWBit(
3245                    rexAMode_M_enc(0, i->Ain.A87PushPop.addr) );
3246          *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
3247          p = doAMode_M_enc(p, 0/*subopcode*/, i->Ain.A87PushPop.addr);
3248       } else {
3249          /* Dump %st(0) to memory: fstps/fstpl amode */
3250          *p++ = clearWBit(
3251                    rexAMode_M_enc(3, i->Ain.A87PushPop.addr) );
3252          *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
3253          p = doAMode_M_enc(p, 3/*subopcode*/, i->Ain.A87PushPop.addr);
3255       }
3256       goto done;
3257 
3258    case Ain_A87FpOp:
3259       switch (i->Ain.A87FpOp.op) {
3260          case Afp_SQRT:   *p++ = 0xD9; *p++ = 0xFA; break;
3261          case Afp_SIN:    *p++ = 0xD9; *p++ = 0xFE; break;
3262          case Afp_COS:    *p++ = 0xD9; *p++ = 0xFF; break;
3263          case Afp_ROUND:  *p++ = 0xD9; *p++ = 0xFC; break;
3264          case Afp_2XM1:   *p++ = 0xD9; *p++ = 0xF0; break;
3265          case Afp_SCALE:  *p++ = 0xD9; *p++ = 0xFD; break;
3266          case Afp_ATAN:   *p++ = 0xD9; *p++ = 0xF3; break;
3267          case Afp_YL2X:   *p++ = 0xD9; *p++ = 0xF1; break;
3268          case Afp_YL2XP1: *p++ = 0xD9; *p++ = 0xF9; break;
3269          case Afp_PREM:   *p++ = 0xD9; *p++ = 0xF8; break;
3270          case Afp_PREM1:  *p++ = 0xD9; *p++ = 0xF5; break;
3271          case Afp_TAN:
3272             /* fptan pushes 1.0 on the FP stack, except when the
3273                argument is out of range.  Hence we have to do the
3274                instruction, then inspect C2 to see if there is an out
3275                of range condition.  If there is, we skip the fincstp
3276                that is used by the in-range case to get rid of this
3277                extra 1.0 value. */
3278             *p++ = 0xD9; *p++ = 0xF2; // fptan
3279             *p++ = 0x50;              // pushq %rax
3280             *p++ = 0xDF; *p++ = 0xE0; // fnstsw %ax
3281             *p++ = 0x66; *p++ = 0xA9;
3282             *p++ = 0x00; *p++ = 0x04; // testw $0x400,%ax
3283             *p++ = 0x75; *p++ = 0x02; // jnz after_fincstp
3284             *p++ = 0xD9; *p++ = 0xF7; // fincstp
3285             *p++ = 0x58;              // after_fincstp: popq %rax
3286             break;
3287          default:
3288             goto bad;
3289       }
3290       goto done;
3291 
3292    case Ain_A87LdCW:
3293       *p++ = clearWBit(
3294                 rexAMode_M_enc(5, i->Ain.A87LdCW.addr) );
3295       *p++ = 0xD9;
3296       p = doAMode_M_enc(p, 5/*subopcode*/, i->Ain.A87LdCW.addr);
3297       goto done;
3298 
3299    case Ain_A87StSW:
3300       *p++ = clearWBit(
3301                 rexAMode_M_enc(7, i->Ain.A87StSW.addr) );
3302       *p++ = 0xDD;
3303       p = doAMode_M_enc(p, 7/*subopcode*/, i->Ain.A87StSW.addr);
3304       goto done;
3305 
3306    case Ain_Store:
3307       if (i->Ain.Store.sz == 2) {
3308          /* This just goes to show the craziness of the instruction
3309             set encoding.  We have to insert two prefix bytes, but be
3310             careful to avoid a conflict in what the size should be, by
3311             ensuring that REX.W = 0. */
3312          *p++ = 0x66; /* override to 16-bits */
3313          *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3314          *p++ = 0x89;
3315          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3316          goto done;
3317       }
3318       if (i->Ain.Store.sz == 4) {
3319          *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3320          *p++ = 0x89;
3321          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3322          goto done;
3323       }
3324       if (i->Ain.Store.sz == 1) {
3325          /* This is one place where it would be wrong to skip emitting
3326             a rex byte of 0x40, since the mere presence of rex changes
3327             the meaning of the byte register access.  Be careful. */
3328          *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3329          *p++ = 0x88;
3330          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3331          goto done;
3332       }
3333       break;
3334 
3335    case Ain_LdMXCSR:
3336       *p++ = clearWBit(rexAMode_M_enc(0, i->Ain.LdMXCSR.addr));
3337       *p++ = 0x0F;
3338       *p++ = 0xAE;
3339       p = doAMode_M_enc(p, 2/*subopcode*/, i->Ain.LdMXCSR.addr);
3340       goto done;
3341 
3342    case Ain_SseUComIS:
3343       /* ucomi[sd] %srcL, %srcR ;  pushfq ; popq %dst */
3344       /* ucomi[sd] %srcL, %srcR */
3345       if (i->Ain.SseUComIS.sz == 8) {
3346          *p++ = 0x66;
3347       } else {
3348          /* The 4-byte (ucomiss) variant is not handled here. */
3349          goto bad;
3350       }
3351       *p++ = clearWBit (
3352              rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseUComIS.srcL),
3353                                  vregEnc3210(i->Ain.SseUComIS.srcR) ));
3354       *p++ = 0x0F;
3355       *p++ = 0x2E;
3356       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseUComIS.srcL),
3357                                vregEnc3210(i->Ain.SseUComIS.srcR) );
3358       /* pushfq */
3359       *p++ = 0x9C;
3360       /* popq %dst */
3361       *p++ = toUChar(0x40 + (1 & iregEnc3(i->Ain.SseUComIS.dst)));
3362       *p++ = toUChar(0x58 + iregEnc210(i->Ain.SseUComIS.dst));
3363       goto done;
3364 
3365    case Ain_SseSI2SF:
3366       /* cvtsi2s[sd] %src, %dst */
3367       rex = rexAMode_R_enc_reg( vregEnc3210(i->Ain.SseSI2SF.dst),
3368                                 i->Ain.SseSI2SF.src );
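           /* Note the mandatory F3/F2 prefix is emitted before the REX
              byte: REX must immediately precede the 0F 2A opcode. */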
3369       *p++ = toUChar(i->Ain.SseSI2SF.szD==4 ? 0xF3 : 0xF2);
3370       *p++ = toUChar(i->Ain.SseSI2SF.szS==4 ? clearWBit(rex) : rex);
3371       *p++ = 0x0F;
3372       *p++ = 0x2A;
3373       p = doAMode_R_enc_reg( p, vregEnc3210(i->Ain.SseSI2SF.dst),
3374                                 i->Ain.SseSI2SF.src );
3375       goto done;
3376 
3377    case Ain_SseSF2SI:
3378       /* cvts[sd]2si %src, %dst */
3379       rex = rexAMode_R_reg_enc( i->Ain.SseSF2SI.dst,
3380                                 vregEnc3210(i->Ain.SseSF2SI.src) );
3381       *p++ = toUChar(i->Ain.SseSF2SI.szS==4 ? 0xF3 : 0xF2);
3382       *p++ = toUChar(i->Ain.SseSF2SI.szD==4 ? clearWBit(rex) : rex);
3383       *p++ = 0x0F;
3384       *p++ = 0x2D;
3385       p = doAMode_R_reg_enc( p, i->Ain.SseSF2SI.dst,
3386                                 vregEnc3210(i->Ain.SseSF2SI.src) );
3387       goto done;
3388 
3389    case Ain_SseSDSS:
3390       /* cvtsd2ss/cvtss2sd %src, %dst */
3391       *p++ = toUChar(i->Ain.SseSDSS.from64 ? 0xF2 : 0xF3);
3392       *p++ = clearWBit(
3393               rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseSDSS.dst),
3394                                   vregEnc3210(i->Ain.SseSDSS.src) ));
3395       *p++ = 0x0F;
3396       *p++ = 0x5A;
3397       p = doAMode_R_enc_enc( p, vregEnc3210(i->Ain.SseSDSS.dst),
3398                                 vregEnc3210(i->Ain.SseSDSS.src) );
3399       goto done;
3400 
3401    case Ain_SseLdSt:
3402       if (i->Ain.SseLdSt.sz == 8) {
3403          *p++ = 0xF2;
3404       } else
3405       if (i->Ain.SseLdSt.sz == 4) {
3406          *p++ = 0xF3;
3407       } else
3408       if (i->Ain.SseLdSt.sz != 16) {
3409          vassert(0);
3410       }
3411       *p++ = clearWBit(
3412              rexAMode_M_enc(vregEnc3210(i->Ain.SseLdSt.reg),
3413                             i->Ain.SseLdSt.addr));
3414       *p++ = 0x0F;
3415       *p++ = toUChar(i->Ain.SseLdSt.isLoad ? 0x10 : 0x11);
3416       p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseLdSt.reg),
3417                            i->Ain.SseLdSt.addr);
3418       goto done;
3419 
3420    case Ain_SseLdzLO:
3421       vassert(i->Ain.SseLdzLO.sz == 4 || i->Ain.SseLdzLO.sz == 8);
3422       /* movs[sd] amode, %xmm-dst */
3423       *p++ = toUChar(i->Ain.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
3424       *p++ = clearWBit(
3425              rexAMode_M_enc(vregEnc3210(i->Ain.SseLdzLO.reg),
3426                             i->Ain.SseLdzLO.addr));
3427       *p++ = 0x0F;
3428       *p++ = 0x10;
3429       p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseLdzLO.reg),
3430                            i->Ain.SseLdzLO.addr);
3431       goto done;
3432 
3433    case Ain_Sse32Fx4:
3434       xtra = 0;
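           /* xtra's bit 8 flags that a trailing imm8 must be emitted: for
              the CMP forms its low byte is the SSE compare predicate
              (0 = EQ, 1 = LT, 2 = LE, 3 = UNORD). */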
3435       *p++ = clearWBit(
3436              rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse32Fx4.dst),
3437                                  vregEnc3210(i->Ain.Sse32Fx4.src) ));
3438       *p++ = 0x0F;
3439       switch (i->Ain.Sse32Fx4.op) {
3440          case Asse_ADDF:   *p++ = 0x58; break;
3441          case Asse_DIVF:   *p++ = 0x5E; break;
3442          case Asse_MAXF:   *p++ = 0x5F; break;
3443          case Asse_MINF:   *p++ = 0x5D; break;
3444          case Asse_MULF:   *p++ = 0x59; break;
3445          case Asse_RCPF:   *p++ = 0x53; break;
3446          case Asse_RSQRTF: *p++ = 0x52; break;
3447          case Asse_SQRTF:  *p++ = 0x51; break;
3448          case Asse_SUBF:   *p++ = 0x5C; break;
3449          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3450          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3451          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3452          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3453          default: goto bad;
3454       }
3455       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse32Fx4.dst),
3456                                vregEnc3210(i->Ain.Sse32Fx4.src) );
3457       if (xtra & 0x100)
3458          *p++ = toUChar(xtra & 0xFF);
3459       goto done;
3460 
3461    case Ain_Sse64Fx2:
3462       xtra = 0;
3463       *p++ = 0x66;
3464       *p++ = clearWBit(
3465              rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse64Fx2.dst),
3466                                  vregEnc3210(i->Ain.Sse64Fx2.src) ));
3467       *p++ = 0x0F;
3468       switch (i->Ain.Sse64Fx2.op) {
3469          case Asse_ADDF:   *p++ = 0x58; break;
3470          case Asse_DIVF:   *p++ = 0x5E; break;
3471          case Asse_MAXF:   *p++ = 0x5F; break;
3472          case Asse_MINF:   *p++ = 0x5D; break;
3473          case Asse_MULF:   *p++ = 0x59; break;
3474          case Asse_SQRTF:  *p++ = 0x51; break;
3475          case Asse_SUBF:   *p++ = 0x5C; break;
3476          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3477          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3478          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3479          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3480          default: goto bad;
3481       }
3482       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse64Fx2.dst),
3483                                vregEnc3210(i->Ain.Sse64Fx2.src) );
3484       if (xtra & 0x100)
3485          *p++ = toUChar(xtra & 0xFF);
3486       goto done;
3487 
3488    case Ain_Sse32FLo:
3489       xtra = 0;
3490       *p++ = 0xF3;
3491       *p++ = clearWBit(
3492              rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse32FLo.dst),
3493                                  vregEnc3210(i->Ain.Sse32FLo.src) ));
3494       *p++ = 0x0F;
3495       switch (i->Ain.Sse32FLo.op) {
3496          case Asse_ADDF:   *p++ = 0x58; break;
3497          case Asse_DIVF:   *p++ = 0x5E; break;
3498          case Asse_MAXF:   *p++ = 0x5F; break;
3499          case Asse_MINF:   *p++ = 0x5D; break;
3500          case Asse_MULF:   *p++ = 0x59; break;
3501          case Asse_RCPF:   *p++ = 0x53; break;
3502          case Asse_RSQRTF: *p++ = 0x52; break;
3503          case Asse_SQRTF:  *p++ = 0x51; break;
3504          case Asse_SUBF:   *p++ = 0x5C; break;
3505          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3506          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3507          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3508          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3509          default: goto bad;
3510       }
3511       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse32FLo.dst),
3512                                vregEnc3210(i->Ain.Sse32FLo.src) );
3513       if (xtra & 0x100)
3514          *p++ = toUChar(xtra & 0xFF);
3515       goto done;
3516 
3517    case Ain_Sse64FLo:
3518       xtra = 0;
3519       *p++ = 0xF2;
3520       *p++ = clearWBit(
3521              rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse64FLo.dst),
3522                                  vregEnc3210(i->Ain.Sse64FLo.src) ));
3523       *p++ = 0x0F;
3524       switch (i->Ain.Sse64FLo.op) {
3525          case Asse_ADDF:   *p++ = 0x58; break;
3526          case Asse_DIVF:   *p++ = 0x5E; break;
3527          case Asse_MAXF:   *p++ = 0x5F; break;
3528          case Asse_MINF:   *p++ = 0x5D; break;
3529          case Asse_MULF:   *p++ = 0x59; break;
3530          case Asse_SQRTF:  *p++ = 0x51; break;
3531          case Asse_SUBF:   *p++ = 0x5C; break;
3532          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3533          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3534          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3535          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3536          default: goto bad;
3537       }
3538       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse64FLo.dst),
3539                                vregEnc3210(i->Ain.Sse64FLo.src) );
3540       if (xtra & 0x100)
3541          *p++ = toUChar(xtra & 0xFF);
3542       goto done;
3543 
3544    case Ain_SseReRg:
3545 #     define XX(_n) *p++ = (_n)
3546 
3547       rex = clearWBit(
3548             rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseReRg.dst),
3549                                 vregEnc3210(i->Ain.SseReRg.src) ));
3550 
3551       switch (i->Ain.SseReRg.op) {
3552          case Asse_MOV:     /*movups*/ XX(rex); XX(0x0F); XX(0x10); break;
3553          case Asse_OR:                 XX(rex); XX(0x0F); XX(0x56); break;
3554          case Asse_XOR:                XX(rex); XX(0x0F); XX(0x57); break;
3555          case Asse_AND:                XX(rex); XX(0x0F); XX(0x54); break;
3556          case Asse_ANDN:               XX(rex); XX(0x0F); XX(0x55); break;
3557          case Asse_PACKSSD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6B); break;
3558          case Asse_PACKSSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x63); break;
3559          case Asse_PACKUSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x67); break;
3560          case Asse_ADD8:     XX(0x66); XX(rex); XX(0x0F); XX(0xFC); break;
3561          case Asse_ADD16:    XX(0x66); XX(rex); XX(0x0F); XX(0xFD); break;
3562          case Asse_ADD32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFE); break;
3563          case Asse_ADD64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD4); break;
3564          case Asse_QADD8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEC); break;
3565          case Asse_QADD16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xED); break;
3566          case Asse_QADD8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xDC); break;
3567          case Asse_QADD16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xDD); break;
3568          case Asse_AVG8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xE0); break;
3569          case Asse_AVG16U:   XX(0x66); XX(rex); XX(0x0F); XX(0xE3); break;
3570          case Asse_CMPEQ8:   XX(0x66); XX(rex); XX(0x0F); XX(0x74); break;
3571          case Asse_CMPEQ16:  XX(0x66); XX(rex); XX(0x0F); XX(0x75); break;
3572          case Asse_CMPEQ32:  XX(0x66); XX(rex); XX(0x0F); XX(0x76); break;
3573          case Asse_CMPGT8S:  XX(0x66); XX(rex); XX(0x0F); XX(0x64); break;
3574          case Asse_CMPGT16S: XX(0x66); XX(rex); XX(0x0F); XX(0x65); break;
3575          case Asse_CMPGT32S: XX(0x66); XX(rex); XX(0x0F); XX(0x66); break;
3576          case Asse_MAX16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEE); break;
3577          case Asse_MAX8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDE); break;
3578          case Asse_MIN16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEA); break;
3579          case Asse_MIN8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDA); break;
3580          case Asse_MULHI16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE4); break;
3581          case Asse_MULHI16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE5); break;
3582          case Asse_MUL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD5); break;
3583          case Asse_SHL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF1); break;
3584          case Asse_SHL32:    XX(0x66); XX(rex); XX(0x0F); XX(0xF2); break;
3585          case Asse_SHL64:    XX(0x66); XX(rex); XX(0x0F); XX(0xF3); break;
3586          case Asse_SAR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xE1); break;
3587          case Asse_SAR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xE2); break;
3588          case Asse_SHR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD1); break;
3589          case Asse_SHR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xD2); break;
3590          case Asse_SHR64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD3); break;
3591          case Asse_SUB8:     XX(0x66); XX(rex); XX(0x0F); XX(0xF8); break;
3592          case Asse_SUB16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF9); break;
3593          case Asse_SUB32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFA); break;
3594          case Asse_SUB64:    XX(0x66); XX(rex); XX(0x0F); XX(0xFB); break;
3595          case Asse_QSUB8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xE8); break;
3596          case Asse_QSUB16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xE9); break;
3597          case Asse_QSUB8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xD8); break;
3598          case Asse_QSUB16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xD9); break;
3599          case Asse_UNPCKHB:  XX(0x66); XX(rex); XX(0x0F); XX(0x68); break;
3600          case Asse_UNPCKHW:  XX(0x66); XX(rex); XX(0x0F); XX(0x69); break;
3601          case Asse_UNPCKHD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6A); break;
3602          case Asse_UNPCKHQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6D); break;
3603          case Asse_UNPCKLB:  XX(0x66); XX(rex); XX(0x0F); XX(0x60); break;
3604          case Asse_UNPCKLW:  XX(0x66); XX(rex); XX(0x0F); XX(0x61); break;
3605          case Asse_UNPCKLD:  XX(0x66); XX(rex); XX(0x0F); XX(0x62); break;
3606          case Asse_UNPCKLQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6C); break;
3607          default: goto bad;
3608       }
3609       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseReRg.dst),
3610                                vregEnc3210(i->Ain.SseReRg.src) );
3611 #     undef XX
3612       goto done;
3613 
3614    case Ain_SseCMov:
3615       /* jmp fwds if !condition */
3616       *p++ = toUChar(0x70 + (i->Ain.SseCMov.cond ^ 1));
3617       *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
3618       ptmp = p;
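           /* Note: unlike the other backpatch sites above, ptmp here points
              just past the displacement byte, so the fixup below writes to
              ptmp-1 and the distance is simply p - ptmp. */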
3619 
3620       /* movaps %src, %dst */
3621       *p++ = clearWBit(
3622              rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseCMov.dst),
3623                                  vregEnc3210(i->Ain.SseCMov.src) ));
3624       *p++ = 0x0F;
3625       *p++ = 0x28;
3626       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseCMov.dst),
3627                                vregEnc3210(i->Ain.SseCMov.src) );
3628 
3629       /* Fill in the jump offset. */
3630       *(ptmp-1) = toUChar(p - ptmp);
3631       goto done;
3632 
3633    case Ain_SseShuf:
3634       *p++ = 0x66;
3635       *p++ = clearWBit(
3636              rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseShuf.dst),
3637                                  vregEnc3210(i->Ain.SseShuf.src) ));
3638       *p++ = 0x0F;
3639       *p++ = 0x70;
3640       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseShuf.dst),
3641                                vregEnc3210(i->Ain.SseShuf.src) );
3642       *p++ = (UChar)(i->Ain.SseShuf.order);
3643       goto done;
3644 
3645    //uu case Ain_AvxLdSt: {
3646    //uu    UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg),
3647    //uu                           i->Ain.AvxLdSt.addr );
3648    //uu    p = emitVexPrefix(p, vex);
3649    //uu    *p++ = toUChar(i->Ain.AvxLdSt.isLoad ? 0x10 : 0x11);
3650    //uu    p = doAMode_M(p, dvreg2ireg(i->Ain.AvxLdSt.reg), i->Ain.AvxLdSt.addr);
3651    //uu      goto done;
3652    //uu }
3653 
3654    case Ain_EvCheck: {
3655       /* We generate:
3656             (3 bytes)  decl 8(%rbp)    8 == offsetof(host_EvC_COUNTER)
3657             (2 bytes)  jns  nofail     expected taken
3658             (3 bytes)  jmp* 0(%rbp)    0 == offsetof(host_EvC_FAILADDR)
3659             nofail:
3660       */
3661       /* This is heavily asserted re instruction lengths.  It needs to
3662          be.  If we are given unexpected forms of .amCounter or
3663          .amFailAddr -- basically, anything that's not of the form
3664          uimm7(%rbp) -- these assertions are likely to fail. */
3665       /* Note also that after the decl we must be very careful not to
3666          read the carry flag, else we get a partial flags stall.
3667          js/jns avoids that, though. */
3668       UChar* p0 = p;
3669       /* ---  decl 8(%rbp) --- */
3670       /* Need to compute the REX byte for the decl in order to prove
3671          that we don't need it, since this is a 32-bit decl and all
3672          registers involved in the amode are < r8.  "1" because
3673          there's no register in this encoding; instead the register
3674          field is used as a sub opcode.  The encoding for "decl r/m32"
3675          is FF /1, hence the "1". */
3676       rex = clearWBit(rexAMode_M_enc(1, i->Ain.EvCheck.amCounter));
3677       if (rex != 0x40) goto bad; /* We don't expect to need the REX byte. */
3678       *p++ = 0xFF;
3679       p = doAMode_M_enc(p, 1, i->Ain.EvCheck.amCounter);
3680       vassert(p - p0 == 3);
3681       /* --- jns nofail --- */
3682       *p++ = 0x79;
3683       *p++ = 0x03; /* need to check this 0x03 after the next insn */
3684       vassert(p - p0 == 5);
3685       /* --- jmp* 0(%rbp) --- */
3686       /* Once again, verify we don't need REX.  The encoding is FF /4.
3687          We don't need REX.W since by default FF /4 in 64-bit mode
3688          implies a 64 bit load. */
3689       rex = clearWBit(rexAMode_M_enc(4, i->Ain.EvCheck.amFailAddr));
3690       if (rex != 0x40) goto bad;
3691       *p++ = 0xFF;
3692       p = doAMode_M_enc(p, 4, i->Ain.EvCheck.amFailAddr);
3693       vassert(p - p0 == 8); /* also ensures that 0x03 offset above is ok */
3694       /* And crosscheck .. */
3695       vassert(evCheckSzB_AMD64() == 8);
3696       goto done;
3697    }
3698 
3699    case Ain_ProfInc: {
3700       /* We generate   movabsq $0, %r11
3701                        incq (%r11)
3702          in the expectation that a later call to LibVEX_patchProfCtr
3703          will be used to fill in the immediate field once the right
3704          value is known.
3705          49 BB 00 00 00 00 00 00 00 00
3706          49 FF 03
3707       */
3708       *p++ = 0x49; *p++ = 0xBB;
3709       *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
3710       *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
3711       *p++ = 0x49; *p++ = 0xFF; *p++ = 0x03;
3712       /* Tell the caller .. */
3713       vassert(!(*is_profInc));
3714       *is_profInc = True;
3715       goto done;
3716    }
3717 
3718    default:
3719       goto bad;
3720    }
3721 
3722   bad:
3723    ppAMD64Instr(i, mode64);
3724    vpanic("emit_AMD64Instr");
3725    /*NOTREACHED*/
3726 
3727   done:
3728    vassert(p - &buf[0] <= 32);
3729    return p - &buf[0];
3730 }
3731 
3732 
3733 /* How big is an event check?  See case for Ain_EvCheck in
3734    emit_AMD64Instr just above.  That crosschecks what this returns, so
3735    we can tell if we're inconsistent. */
3736 Int evCheckSzB_AMD64 (void)
3737 {
3738    return 8;
3739 }
3740 
3741 
3742 /* NB: what goes on here has to be very closely coordinated with the
3743    emitInstr case for XDirect, above. */
3744 VexInvalRange chainXDirect_AMD64 ( VexEndness endness_host,
3745                                    void* place_to_chain,
3746                                    const void* disp_cp_chain_me_EXPECTED,
3747                                    const void* place_to_jump_to )
3748 {
3749    vassert(endness_host == VexEndnessLE);
3750 
3751    /* What we're expecting to see is:
3752         movabsq $disp_cp_chain_me_EXPECTED, %r11
3753         call *%r11
3754       viz
3755         49 BB <8 bytes value == disp_cp_chain_me_EXPECTED>
3756         41 FF D3
3757    */
3758    UChar* p = (UChar*)place_to_chain;
3759    vassert(p[0] == 0x49);
3760    vassert(p[1] == 0xBB);
3761    vassert(*(Addr*)(&p[2]) == (Addr)disp_cp_chain_me_EXPECTED);
3762    vassert(p[10] == 0x41);
3763    vassert(p[11] == 0xFF);
3764    vassert(p[12] == 0xD3);
3765    /* And what we want to change it to is either:
3766         (general case):
3767           movabsq $place_to_jump_to, %r11
3768           jmpq *%r11
3769         viz
3770           49 BB <8 bytes value == place_to_jump_to>
3771           41 FF E3
3772         So it's the same length (convenient, huh) and we don't
3773         need to change all the bits.
3774       ---OR---
3775         in the case where the displacement falls within 32 bits
3776           jmpq disp32   where disp32 is relative to the next insn
3777           ud2; ud2; ud2; ud2
3778         viz
3779           E9 <4 bytes == disp32>
3780           0F 0B 0F 0B 0F 0B 0F 0B
3781 
3782       In both cases the replacement has the same length as the original.
3783       To remain sane & verifiable,
3784       (1) limit the displacement for the short form to
3785           (say) +/- one billion, so as to avoid wraparound
3786           off-by-ones
3787       (2) even if the short form is applicable, once every (say)
3788           1024 times use the long form anyway, so as to maintain
3789           verifiability
3790    */
3791    /* This is the delta we need to put into a JMP d32 insn.  It's
3792       relative to the start of the next insn, hence the -5.  */
3793    Long delta   = (Long)((const UChar *)place_to_jump_to - (const UChar*)p) - 5;
3794    Bool shortOK = delta >= -1000*1000*1000 && delta < 1000*1000*1000;
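        /* For example, a target 0x1234 bytes beyond the patch site would
           give the short form, roughly, E9 2F 12 00 00 followed by four
           ud2's (0F 0B). */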
3795 
3796    static UInt shortCTR = 0; /* DO NOT MAKE NON-STATIC */
3797    if (shortOK) {
3798       shortCTR++; // thread safety bleh
3799       if (0 == (shortCTR & 0x3FF)) {
3800          shortOK = False;
3801          if (0)
3802             vex_printf("QQQ chainXDirect_AMD64: shortCTR = %u, "
3803                        "using long jmp\n", shortCTR);
3804       }
3805    }
3806 
3807    /* And make the modifications. */
3808    if (shortOK) {
3809       p[0]  = 0xE9;
3810       p[1]  = (delta >> 0) & 0xFF;
3811       p[2]  = (delta >> 8) & 0xFF;
3812       p[3]  = (delta >> 16) & 0xFF;
3813       p[4]  = (delta >> 24) & 0xFF;
3814       p[5]  = 0x0F; p[6]  = 0x0B;
3815       p[7]  = 0x0F; p[8]  = 0x0B;
3816       p[9]  = 0x0F; p[10] = 0x0B;
3817       p[11] = 0x0F; p[12] = 0x0B;
3818       /* sanity check on the delta -- top 32 are all 0 or all 1 */
3819       delta >>= 32;
3820       vassert(delta == 0LL || delta == -1LL);
3821    } else {
3822       /* Minimal modifications from the starting sequence. */
3823      *(Addr*)(&p[2]) = (Addr)place_to_jump_to;
3824       p[12] = 0xE3;
3825    }
3826    VexInvalRange vir = { (HWord)place_to_chain, 13 };
3827    return vir;
3828 }
3829 
3830 
3831 /* NB: what goes on here has to be very closely coordinated with the
3832    emitInstr case for XDirect, above. */
3833 VexInvalRange unchainXDirect_AMD64 ( VexEndness endness_host,
3834                                      void* place_to_unchain,
3835                                      const void* place_to_jump_to_EXPECTED,
3836                                      const void* disp_cp_chain_me )
3837 {
3838    vassert(endness_host == VexEndnessLE);
3839 
3840    /* What we're expecting to see is either:
3841         (general case)
3842           movabsq $place_to_jump_to_EXPECTED, %r11
3843           jmpq *%r11
3844         viz
3845           49 BB <8 bytes value == place_to_jump_to_EXPECTED>
3846           41 FF E3
3847       ---OR---
3848         in the case where the displacement falls within 32 bits
3849           jmpq d32
3850           ud2; ud2; ud2; ud2
3851         viz
3852           E9 <4 bytes == disp32>
3853           0F 0B 0F 0B 0F 0B 0F 0B
3854    */
3855    UChar* p     = (UChar*)place_to_unchain;
3856    Bool   valid = False;
3857    if (p[0] == 0x49 && p[1] == 0xBB
3858        && *(Addr*)(&p[2]) == (Addr)place_to_jump_to_EXPECTED
3859        && p[10] == 0x41 && p[11] == 0xFF && p[12] == 0xE3) {
3860       /* it's the long form */
3861       valid = True;
3862    }
3863    else
3864    if (p[0] == 0xE9
3865        && p[5]  == 0x0F && p[6]  == 0x0B
3866        && p[7]  == 0x0F && p[8]  == 0x0B
3867        && p[9]  == 0x0F && p[10] == 0x0B
3868        && p[11] == 0x0F && p[12] == 0x0B) {
3869       /* It's the short form.  Check the offset is right. */
3870       Int  s32 = *(Int*)(&p[1]);
3871       Long s64 = (Long)s32;
3872       if ((UChar*)p + 5 + s64 == place_to_jump_to_EXPECTED) {
3873          valid = True;
3874          if (0)
3875             vex_printf("QQQ unchainXDirect_AMD64: found short form\n");
3876       }
3877    }
3878    vassert(valid);
3879    /* And what we want to change it to is:
3880         movabsq $disp_cp_chain_me, %r11
3881         call *%r11
3882       viz
3883         49 BB <8 bytes value == disp_cp_chain_me>
3884         41 FF D3
3885       So it's the same length (convenient, huh).
3886    */
3887    p[0] = 0x49;
3888    p[1] = 0xBB;
3889    *(Addr*)(&p[2]) = (Addr)disp_cp_chain_me;
3890    p[10] = 0x41;
3891    p[11] = 0xFF;
3892    p[12] = 0xD3;
3893    VexInvalRange vir = { (HWord)place_to_unchain, 13 };
3894    return vir;
3895 }
3896 
3897 
3898 /* Patch the counter address into a profile inc point, as previously
3899    created by the Ain_ProfInc case for emit_AMD64Instr. */
3900 VexInvalRange patchProfInc_AMD64 ( VexEndness endness_host,
3901                                    void*  place_to_patch,
3902                                    const ULong* location_of_counter )
3903 {
3904    vassert(endness_host == VexEndnessLE);
3905    vassert(sizeof(ULong*) == 8);
3906    UChar* p = (UChar*)place_to_patch;
3907    vassert(p[0] == 0x49);
3908    vassert(p[1] == 0xBB);
3909    vassert(p[2] == 0x00);
3910    vassert(p[3] == 0x00);
3911    vassert(p[4] == 0x00);
3912    vassert(p[5] == 0x00);
3913    vassert(p[6] == 0x00);
3914    vassert(p[7] == 0x00);
3915    vassert(p[8] == 0x00);
3916    vassert(p[9] == 0x00);
3917    vassert(p[10] == 0x49);
3918    vassert(p[11] == 0xFF);
3919    vassert(p[12] == 0x03);
3920    ULong imm64 = (ULong)(Addr)location_of_counter;
3921    p[2] = imm64 & 0xFF; imm64 >>= 8;
3922    p[3] = imm64 & 0xFF; imm64 >>= 8;
3923    p[4] = imm64 & 0xFF; imm64 >>= 8;
3924    p[5] = imm64 & 0xFF; imm64 >>= 8;
3925    p[6] = imm64 & 0xFF; imm64 >>= 8;
3926    p[7] = imm64 & 0xFF; imm64 >>= 8;
3927    p[8] = imm64 & 0xFF; imm64 >>= 8;
3928    p[9] = imm64 & 0xFF; imm64 >>= 8;
3929    VexInvalRange vir = { (HWord)place_to_patch, 13 };
3930    return vir;
3931 }
3932 
3933 
3934 /*---------------------------------------------------------------*/
3935 /*--- end                                   host_amd64_defs.c ---*/
3936 /*---------------------------------------------------------------*/
3937