1
2 /*---------------------------------------------------------------*/
3 /*--- begin host_amd64_isel.c ---*/
4 /*---------------------------------------------------------------*/
5
6 /*
7 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
9
10 Copyright (C) 2004-2013 OpenWorks LLP
11 info@open-works.net
12
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
17
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, write to the Free Software
25 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26 02110-1301, USA.
27
28 The GNU General Public License is contained in the file COPYING.
29
30 Neither the names of the U.S. Department of Energy nor the
31 University of California nor the names of its contributors may be
32 used to endorse or promote products derived from this software
33 without prior written permission.
34 */
35
36 #include "libvex_basictypes.h"
37 #include "libvex_ir.h"
38 #include "libvex.h"
39
40 #include "ir_match.h"
41 #include "main_util.h"
42 #include "main_globals.h"
43 #include "host_generic_regs.h"
44 #include "host_generic_simd64.h"
45 #include "host_generic_simd128.h"
46 #include "host_generic_simd256.h"
47 #include "host_generic_maddf.h"
48 #include "host_amd64_defs.h"
49
50
51 /*---------------------------------------------------------*/
52 /*--- x87/SSE control word stuff ---*/
53 /*---------------------------------------------------------*/
54
55 /* Vex-generated code expects to run with the FPU set as follows: all
56 exceptions masked, round-to-nearest, precision = 53 bits. This
57 corresponds to a FPU control word value of 0x027F.
58
59 Similarly the SSE control word (%mxcsr) should be 0x1F80.
60
   %fpucw and %mxcsr should have these values on entry to
   Vex-generated code, and those values should be
   unchanged at exit.
64 */
65
/* x87 FPU control word that Vex-generated code expects to run with:
   all exceptions masked, round-to-nearest, 53-bit precision. */
#define DEFAULT_FPUCW 0x027F

/* SSE control word (%mxcsr) that Vex-generated code expects to run
   with. */
#define DEFAULT_MXCSR 0x1F80
69
70 /* debugging only, do not use */
71 /* define DEFAULT_FPUCW 0x037F */
72
73
74 /*---------------------------------------------------------*/
75 /*--- misc helpers ---*/
76 /*---------------------------------------------------------*/
77
78 /* These are duplicated in guest-amd64/toIR.c */
/* Shorthand: build the IR expression for unary operator |op| applied
   to |a|. */
static IRExpr* unop ( IROp op, IRExpr* a )
{
   IRExpr* node = IRExpr_Unop(op, a);
   return node;
}
83
/* Shorthand: build the IR expression for binary operator |op| applied
   to |a1| and |a2|. */
static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   IRExpr* node = IRExpr_Binop(op, a1, a2);
   return node;
}
88
bind(Int binder)89 static IRExpr* bind ( Int binder )
90 {
91 return IRExpr_Binder(binder);
92 }
93
isZeroU8(IRExpr * e)94 static Bool isZeroU8 ( IRExpr* e )
95 {
96 return e->tag == Iex_Const
97 && e->Iex.Const.con->tag == Ico_U8
98 && e->Iex.Const.con->Ico.U8 == 0;
99 }
100
101
102 /*---------------------------------------------------------*/
103 /*--- ISelEnv ---*/
104 /*---------------------------------------------------------*/
105
106 /* This carries around:
107
108 - A mapping from IRTemp to IRType, giving the type of any IRTemp we
109 might encounter. This is computed before insn selection starts,
110 and does not change.
111
112 - A mapping from IRTemp to HReg. This tells the insn selector
113 which virtual register is associated with each IRTemp
114 temporary. This is computed before insn selection starts, and
115 does not change. We expect this mapping to map precisely the
116 same set of IRTemps as the type mapping does.
117
118 - vregmap holds the primary register for the IRTemp.
119 - vregmapHI is only used for 128-bit integer-typed
120 IRTemps. It holds the identity of a second
121 64-bit virtual HReg, which holds the high half
122 of the value.
123
124 - The host subarchitecture we are selecting insns for.
125 This is set at the start and does not change.
126
127 - The code array, that is, the insns selected so far.
128
129 - A counter, for generating new virtual registers.
130
131 - A Bool for indicating whether we may generate chain-me
132 instructions for control flow transfers, or whether we must use
133 XAssisted.
134
135 - The maximum guest address of any guest insn in this block.
136 Actually, the address of the highest-addressed byte from any insn
137 in this block. Is set at the start and does not change. This is
138 used for detecting jumps which are definitely forward-edges from
139 this block, and therefore can be made (chained) to the fast entry
140 point of the destination, thereby avoiding the destination's
141 event check.
142
143 Note, this is all host-independent. (JRS 20050201: well, kinda
144 ... not completely. Compare with ISelEnv for X86.)
145 */
146
147 typedef
148 struct {
149 /* Constant -- are set at the start and do not change. */
150 IRTypeEnv* type_env;
151
152 HReg* vregmap;
153 HReg* vregmapHI;
154 Int n_vregmap;
155
156 UInt hwcaps;
157
158 Bool chainingAllowed;
159 Addr64 max_ga;
160
161 /* These are modified as we go along. */
162 HInstrArray* code;
163 Int vreg_ctr;
164 }
165 ISelEnv;
166
167
lookupIRTemp(ISelEnv * env,IRTemp tmp)168 static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
169 {
170 vassert(tmp >= 0);
171 vassert(tmp < env->n_vregmap);
172 return env->vregmap[tmp];
173 }
174
/* Fetch both virtual registers associated with |tmp|: the primary
   one into *vrLO and the high-half one into *vrHI.  Asserts that
   |tmp| is in range and that a high-half register was actually
   assigned to it. */
static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO,
                               ISelEnv* env, IRTemp tmp )
{
   vassert(tmp >= 0);
   vassert(tmp < env->n_vregmap);
   vassert(! hregIsInvalid(env->vregmapHI[tmp]));

   HReg lo = env->vregmap[tmp];
   HReg hi = env->vregmapHI[tmp];
   *vrLO = lo;
   *vrHI = hi;
}
184
/* Append |instr| to the code selected so far.  When VEX_TRACE_VCODE
   tracing is enabled, also print the instruction followed by a
   newline. */
static void addInstr ( ISelEnv* env, AMD64Instr* instr )
{
   addHInstr(env->code, instr);

   if (!(vex_traceflags & VEX_TRACE_VCODE))
      return;

   ppAMD64Instr(instr, True);
   vex_printf("\n");
}
193
newVRegI(ISelEnv * env)194 static HReg newVRegI ( ISelEnv* env )
195 {
196 HReg reg = mkHReg(True/*virtual reg*/, HRcInt64, 0/*enc*/, env->vreg_ctr);
197 env->vreg_ctr++;
198 return reg;
199 }
200
newVRegV(ISelEnv * env)201 static HReg newVRegV ( ISelEnv* env )
202 {
203 HReg reg = mkHReg(True/*virtual reg*/, HRcVec128, 0/*enc*/, env->vreg_ctr);
204 env->vreg_ctr++;
205 return reg;
206 }
207
208
209 /*---------------------------------------------------------*/
210 /*--- ISEL: Forward declarations ---*/
211 /*---------------------------------------------------------*/
212
213 /* These are organised as iselXXX and iselXXX_wrk pairs. The
214 iselXXX_wrk do the real work, but are not to be called directly.
215 For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
216 checks that all returned registers are virtual. You should not
217 call the _wrk version directly.
218 */
219 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e );
220 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, IRExpr* e );
221
222 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e );
223 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, IRExpr* e );
224
225 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e );
226 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, IRExpr* e );
227
228 static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e );
229 static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e );
230
231 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e );
232 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e );
233
234 static void iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
235 ISelEnv* env, IRExpr* e );
236 static void iselInt128Expr ( /*OUT*/HReg* rHi, HReg* rLo,
237 ISelEnv* env, IRExpr* e );
238
239 static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e );
240 static AMD64CondCode iselCondCode ( ISelEnv* env, IRExpr* e );
241
242 static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e );
243 static HReg iselDblExpr ( ISelEnv* env, IRExpr* e );
244
245 static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e );
246 static HReg iselFltExpr ( ISelEnv* env, IRExpr* e );
247
248 static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e );
249 static HReg iselVecExpr ( ISelEnv* env, IRExpr* e );
250
251 static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
252 ISelEnv* env, IRExpr* e );
253 static void iselDVecExpr ( /*OUT*/HReg* rHi, HReg* rLo,
254 ISelEnv* env, IRExpr* e );
255
256
257 /*---------------------------------------------------------*/
258 /*--- ISEL: Misc helpers ---*/
259 /*---------------------------------------------------------*/
260
sane_AMode(AMD64AMode * am)261 static Bool sane_AMode ( AMD64AMode* am )
262 {
263 switch (am->tag) {
264 case Aam_IR:
265 return
266 toBool( hregClass(am->Aam.IR.reg) == HRcInt64
267 && (hregIsVirtual(am->Aam.IR.reg)
268 || sameHReg(am->Aam.IR.reg, hregAMD64_RBP())) );
269 case Aam_IRRS:
270 return
271 toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
272 && hregIsVirtual(am->Aam.IRRS.base)
273 && hregClass(am->Aam.IRRS.index) == HRcInt64
274 && hregIsVirtual(am->Aam.IRRS.index) );
275 default:
276 vpanic("sane_AMode: unknown amd64 amode tag");
277 }
278 }
279
280
281 /* Can the lower 32 bits be signedly widened to produce the whole
282 64-bit value? In other words, are the top 33 bits either all 0 or
283 all 1 ? */
fitsIn32Bits(ULong x)284 static Bool fitsIn32Bits ( ULong x )
285 {
286 Long y1;
287 y1 = x << 32;
288 y1 >>=/*s*/ 32;
289 return toBool(x == y1);
290 }
291
292 /* Is this a 64-bit zero expression? */
293
isZeroU64(IRExpr * e)294 static Bool isZeroU64 ( IRExpr* e )
295 {
296 return e->tag == Iex_Const
297 && e->Iex.Const.con->tag == Ico_U64
298 && e->Iex.Const.con->Ico.U64 == 0ULL;
299 }
300
isZeroU32(IRExpr * e)301 static Bool isZeroU32 ( IRExpr* e )
302 {
303 return e->tag == Iex_Const
304 && e->Iex.Const.con->tag == Ico_U32
305 && e->Iex.Const.con->Ico.U32 == 0;
306 }
307
308 /* Make a int reg-reg move. */
309
mk_iMOVsd_RR(HReg src,HReg dst)310 static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
311 {
312 vassert(hregClass(src) == HRcInt64);
313 vassert(hregClass(dst) == HRcInt64);
314 return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
315 }
316
317 /* Make a vector (128 bit) reg-reg move. */
318
mk_vMOVsd_RR(HReg src,HReg dst)319 static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
320 {
321 vassert(hregClass(src) == HRcVec128);
322 vassert(hregClass(dst) == HRcVec128);
323 return AMD64Instr_SseReRg(Asse_MOV, src, dst);
324 }
325
326 /* Advance/retreat %rsp by n. */
327
add_to_rsp(ISelEnv * env,Int n)328 static void add_to_rsp ( ISelEnv* env, Int n )
329 {
330 vassert(n > 0 && n < 256 && (n%8) == 0);
331 addInstr(env,
332 AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n),
333 hregAMD64_RSP()));
334 }
335
/* Emit an instruction subtracting |n| from %rsp.  |n| must be a
   positive multiple of 8 and less than 256. */
static void sub_from_rsp ( ISelEnv* env, Int n )
{
   vassert(n > 0 && n < 256 && (n%8) == 0);

   AMD64RMI* delta = AMD64RMI_Imm(n);
   addInstr(env, AMD64Instr_Alu64R(Aalu_SUB, delta, hregAMD64_RSP()));
}
343
344 /* Push 64-bit constants on the stack. */
push_uimm64(ISelEnv * env,ULong uimm64)345 static void push_uimm64( ISelEnv* env, ULong uimm64 )
346 {
347 /* If uimm64 can be expressed as the sign extension of its
348 lower 32 bits, we can do it the easy way. */
349 Long simm64 = (Long)uimm64;
350 if ( simm64 == ((Long)(uimm64 << 32) >> 32) ) {
351 addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
352 } else {
353 HReg tmp = newVRegI(env);
354 addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
355 addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
356 }
357 }
358
359
360 /* Used only in doHelperCall. If possible, produce a single
361 instruction which computes 'e' into 'dst'. If not possible, return
362 NULL. */
363
iselIntExpr_single_instruction(ISelEnv * env,HReg dst,IRExpr * e)364 static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
365 HReg dst,
366 IRExpr* e )
367 {
368 /* Per comments in doHelperCall below, appearance of
369 Iex_VECRET implies ill-formed IR. */
370 vassert(e->tag != Iex_VECRET);
371
372 /* In this case we give out a copy of the BaseBlock pointer. */
373 if (UNLIKELY(e->tag == Iex_BBPTR)) {
374 return mk_iMOVsd_RR( hregAMD64_RBP(), dst );
375 }
376
377 vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);
378
379 if (e->tag == Iex_Const) {
380 vassert(e->Iex.Const.con->tag == Ico_U64);
381 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
382 return AMD64Instr_Alu64R(
383 Aalu_MOV,
384 AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
385 dst
386 );
387 } else {
388 return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
389 }
390 }
391
392 if (e->tag == Iex_RdTmp) {
393 HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
394 return mk_iMOVsd_RR(src, dst);
395 }
396
397 if (e->tag == Iex_Get) {
398 vassert(e->Iex.Get.ty == Ity_I64);
399 return AMD64Instr_Alu64R(
400 Aalu_MOV,
401 AMD64RMI_Mem(
402 AMD64AMode_IR(e->Iex.Get.offset,
403 hregAMD64_RBP())),
404 dst);
405 }
406
407 if (e->tag == Iex_Unop
408 && e->Iex.Unop.op == Iop_32Uto64
409 && e->Iex.Unop.arg->tag == Iex_RdTmp) {
410 HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
411 return AMD64Instr_MovxLQ(False, src, dst);
412 }
413
414 if (0) { ppIRExpr(e); vex_printf("\n"); }
415
416 return NULL;
417 }
418
419
420 /* Do a complete function call. |guard| is a Ity_Bit expression
421 indicating whether or not the call happens. If guard==NULL, the
422 call is unconditional. |retloc| is set to indicate where the
423 return value is after the call. The caller (of this fn) must
424 generate code to add |stackAdjustAfterCall| to the stack pointer
425 after the call is done. */
426
427 static
doHelperCall(UInt * stackAdjustAfterCall,RetLoc * retloc,ISelEnv * env,IRExpr * guard,IRCallee * cee,IRType retTy,IRExpr ** args)428 void doHelperCall ( /*OUT*/UInt* stackAdjustAfterCall,
429 /*OUT*/RetLoc* retloc,
430 ISelEnv* env,
431 IRExpr* guard,
432 IRCallee* cee, IRType retTy, IRExpr** args )
433 {
434 AMD64CondCode cc;
435 HReg argregs[6];
436 HReg tmpregs[6];
437 AMD64Instr* fastinstrs[6];
438 UInt n_args, i;
439
440 /* Set default returns. We'll update them later if needed. */
441 *stackAdjustAfterCall = 0;
442 *retloc = mk_RetLoc_INVALID();
443
444 /* These are used for cross-checking that IR-level constraints on
445 the use of IRExpr_VECRET() and IRExpr_BBPTR() are observed. */
446 UInt nVECRETs = 0;
447 UInt nBBPTRs = 0;
448
449 /* Marshal args for a call and do the call.
450
451 This function only deals with a tiny set of possibilities, which
452 cover all helpers in practice. The restrictions are that only
453 arguments in registers are supported, hence only 6x64 integer
454 bits in total can be passed. In fact the only supported arg
455 type is I64.
456
457 The return type can be I{64,32,16,8} or V{128,256}. In the
458 latter two cases, it is expected that |args| will contain the
459 special node IRExpr_VECRET(), in which case this routine
460 generates code to allocate space on the stack for the vector
461 return value. Since we are not passing any scalars on the
462 stack, it is enough to preallocate the return space before
463 marshalling any arguments, in this case.
464
465 |args| may also contain IRExpr_BBPTR(), in which case the
466 value in %rbp is passed as the corresponding argument.
467
468 Generating code which is both efficient and correct when
469 parameters are to be passed in registers is difficult, for the
470 reasons elaborated in detail in comments attached to
471 doHelperCall() in priv/host-x86/isel.c. Here, we use a variant
472 of the method described in those comments.
473
474 The problem is split into two cases: the fast scheme and the
475 slow scheme. In the fast scheme, arguments are computed
476 directly into the target (real) registers. This is only safe
477 when we can be sure that computation of each argument will not
478 trash any real registers set by computation of any other
479 argument.
480
481 In the slow scheme, all args are first computed into vregs, and
482 once they are all done, they are moved to the relevant real
483 regs. This always gives correct code, but it also gives a bunch
484 of vreg-to-rreg moves which are usually redundant but are hard
485 for the register allocator to get rid of.
486
487 To decide which scheme to use, all argument expressions are
488 first examined. If they are all so simple that it is clear they
489 will be evaluated without use of any fixed registers, use the
490 fast scheme, else use the slow scheme. Note also that only
491 unconditional calls may use the fast scheme, since having to
492 compute a condition expression could itself trash real
493 registers. Note that for simplicity, in the case where
494 IRExpr_VECRET() is present, we use the slow scheme. This is
495 motivated by the desire to avoid any possible complexity
496 w.r.t. nested calls.
497
498 Note this requires being able to examine an expression and
499 determine whether or not evaluation of it might use a fixed
500 register. That requires knowledge of how the rest of this insn
501 selector works. Currently just the following 3 are regarded as
502 safe -- hopefully they cover the majority of arguments in
503 practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
504 */
505
506 /* Note that the cee->regparms field is meaningless on AMD64 host
507 (since there is only one calling convention) and so we always
508 ignore it. */
509 n_args = 0;
510 for (i = 0; args[i]; i++)
511 n_args++;
512
513 if (n_args > 6)
514 vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");
515
516 argregs[0] = hregAMD64_RDI();
517 argregs[1] = hregAMD64_RSI();
518 argregs[2] = hregAMD64_RDX();
519 argregs[3] = hregAMD64_RCX();
520 argregs[4] = hregAMD64_R8();
521 argregs[5] = hregAMD64_R9();
522
523 tmpregs[0] = tmpregs[1] = tmpregs[2] =
524 tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;
525
526 fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
527 fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;
528
529 /* First decide which scheme (slow or fast) is to be used. First
530 assume the fast scheme, and select slow if any contraindications
531 (wow) appear. */
532
533 /* We'll need space on the stack for the return value. Avoid
534 possible complications with nested calls by using the slow
535 scheme. */
536 if (retTy == Ity_V128 || retTy == Ity_V256)
537 goto slowscheme;
538
539 if (guard) {
540 if (guard->tag == Iex_Const
541 && guard->Iex.Const.con->tag == Ico_U1
542 && guard->Iex.Const.con->Ico.U1 == True) {
543 /* unconditional */
544 } else {
545 /* Not manifestly unconditional -- be conservative. */
546 goto slowscheme;
547 }
548 }
549
550 /* Ok, let's try for the fast scheme. If it doesn't pan out, we'll
551 use the slow scheme. Because this is tentative, we can't call
552 addInstr (that is, commit to) any instructions until we're
553 handled all the arguments. So park the resulting instructions
554 in a buffer and emit that if we're successful. */
555
556 /* FAST SCHEME */
557 /* In this loop, we process args that can be computed into the
558 destination (real) register with a single instruction, without
559 using any fixed regs. That also includes IRExpr_BBPTR(), but
560 not IRExpr_VECRET(). Indeed, if the IR is well-formed, we can
561 never see IRExpr_VECRET() at this point, since the return-type
562 check above should ensure all those cases use the slow scheme
563 instead. */
564 vassert(n_args >= 0 && n_args <= 6);
565 for (i = 0; i < n_args; i++) {
566 IRExpr* arg = args[i];
567 if (LIKELY(!is_IRExpr_VECRET_or_BBPTR(arg))) {
568 vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
569 }
570 fastinstrs[i]
571 = iselIntExpr_single_instruction( env, argregs[i], args[i] );
572 if (fastinstrs[i] == NULL)
573 goto slowscheme;
574 }
575
576 /* Looks like we're in luck. Emit the accumulated instructions and
577 move on to doing the call itself. */
578 for (i = 0; i < n_args; i++)
579 addInstr(env, fastinstrs[i]);
580
581 /* Fast scheme only applies for unconditional calls. Hence: */
582 cc = Acc_ALWAYS;
583
584 goto handle_call;
585
586
587 /* SLOW SCHEME; move via temporaries */
588 slowscheme:
589 {}
590 # if 0 /* debug only */
591 if (n_args > 0) {for (i = 0; args[i]; i++) {
592 ppIRExpr(args[i]); vex_printf(" "); }
593 vex_printf("\n");}
594 # endif
595
596 /* If we have a vector return type, allocate a place for it on the
597 stack and record its address. */
598 HReg r_vecRetAddr = INVALID_HREG;
599 if (retTy == Ity_V128) {
600 r_vecRetAddr = newVRegI(env);
601 sub_from_rsp(env, 16);
602 addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
603 }
604 else if (retTy == Ity_V256) {
605 r_vecRetAddr = newVRegI(env);
606 sub_from_rsp(env, 32);
607 addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
608 }
609
610 vassert(n_args >= 0 && n_args <= 6);
611 for (i = 0; i < n_args; i++) {
612 IRExpr* arg = args[i];
613 if (UNLIKELY(arg->tag == Iex_BBPTR)) {
614 tmpregs[i] = newVRegI(env);
615 addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[i]));
616 nBBPTRs++;
617 }
618 else if (UNLIKELY(arg->tag == Iex_VECRET)) {
619 /* We stashed the address of the return slot earlier, so just
620 retrieve it now. */
621 vassert(!hregIsInvalid(r_vecRetAddr));
622 tmpregs[i] = r_vecRetAddr;
623 nVECRETs++;
624 }
625 else {
626 vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
627 tmpregs[i] = iselIntExpr_R(env, args[i]);
628 }
629 }
630
631 /* Now we can compute the condition. We can't do it earlier
632 because the argument computations could trash the condition
633 codes. Be a bit clever to handle the common case where the
634 guard is 1:Bit. */
635 cc = Acc_ALWAYS;
636 if (guard) {
637 if (guard->tag == Iex_Const
638 && guard->Iex.Const.con->tag == Ico_U1
639 && guard->Iex.Const.con->Ico.U1 == True) {
640 /* unconditional -- do nothing */
641 } else {
642 cc = iselCondCode( env, guard );
643 }
644 }
645
646 /* Move the args to their final destinations. */
647 for (i = 0; i < n_args; i++) {
648 /* None of these insns, including any spill code that might
649 be generated, may alter the condition codes. */
650 addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
651 }
652
653
654 /* Do final checks, set the return values, and generate the call
655 instruction proper. */
656 handle_call:
657
658 if (retTy == Ity_V128 || retTy == Ity_V256) {
659 vassert(nVECRETs == 1);
660 } else {
661 vassert(nVECRETs == 0);
662 }
663
664 vassert(nBBPTRs == 0 || nBBPTRs == 1);
665
666 vassert(*stackAdjustAfterCall == 0);
667 vassert(is_RetLoc_INVALID(*retloc));
668 switch (retTy) {
669 case Ity_INVALID:
670 /* Function doesn't return a value. */
671 *retloc = mk_RetLoc_simple(RLPri_None);
672 break;
673 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
674 *retloc = mk_RetLoc_simple(RLPri_Int);
675 break;
676 case Ity_V128:
677 *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
678 *stackAdjustAfterCall = 16;
679 break;
680 case Ity_V256:
681 *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
682 *stackAdjustAfterCall = 32;
683 break;
684 default:
685 /* IR can denote other possible return types, but we don't
686 handle those here. */
687 vassert(0);
688 }
689
690 /* Finally, generate the call itself. This needs the *retloc value
691 set in the switch above, which is why it's at the end. */
692 addInstr(env,
693 AMD64Instr_Call(cc, (Addr)cee->addr, n_args, *retloc));
694 }
695
696
697 /* Given a guest-state array descriptor, an index expression and a
698 bias, generate an AMD64AMode holding the relevant guest state
699 offset. */
700
701 static
genGuestArrayOffset(ISelEnv * env,IRRegArray * descr,IRExpr * off,Int bias)702 AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
703 IRExpr* off, Int bias )
704 {
705 HReg tmp, roff;
706 Int elemSz = sizeofIRType(descr->elemTy);
707 Int nElems = descr->nElems;
708
709 /* Throw out any cases not generated by an amd64 front end. In
710 theory there might be a day where we need to handle them -- if
711 we ever run non-amd64-guest on amd64 host. */
712
713 if (nElems != 8 || (elemSz != 1 && elemSz != 8))
714 vpanic("genGuestArrayOffset(amd64 host)");
715
716 /* Compute off into a reg, %off. Then return:
717
718 movq %off, %tmp
719 addq $bias, %tmp (if bias != 0)
720 andq %tmp, 7
721 ... base(%rbp, %tmp, shift) ...
722 */
723 tmp = newVRegI(env);
724 roff = iselIntExpr_R(env, off);
725 addInstr(env, mk_iMOVsd_RR(roff, tmp));
726 if (bias != 0) {
727 /* Make sure the bias is sane, in the sense that there are
728 no significant bits above bit 30 in it. */
729 vassert(-10000 < bias && bias < 10000);
730 addInstr(env,
731 AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
732 }
733 addInstr(env,
734 AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
735 vassert(elemSz == 1 || elemSz == 8);
736 return
737 AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
738 elemSz==8 ? 3 : 0);
739 }
740
741
742 /* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
743 static
set_SSE_rounding_default(ISelEnv * env)744 void set_SSE_rounding_default ( ISelEnv* env )
745 {
746 /* pushq $DEFAULT_MXCSR
747 ldmxcsr 0(%rsp)
748 addq $8, %rsp
749 */
750 AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
751 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
752 addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
753 add_to_rsp(env, 8);
754 }
755
756 /* Mess with the FPU's rounding mode: set to the default rounding mode
757 (DEFAULT_FPUCW). */
758 static
set_FPU_rounding_default(ISelEnv * env)759 void set_FPU_rounding_default ( ISelEnv* env )
760 {
761 /* movq $DEFAULT_FPUCW, -8(%rsp)
762 fldcw -8(%esp)
763 */
764 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
765 addInstr(env, AMD64Instr_Alu64M(
766 Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
767 addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
768 }
769
770
771 /* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
772 expression denoting a value in the range 0 .. 3, indicating a round
773 mode encoded as per type IRRoundingMode. Set the SSE machinery to
774 have the same rounding.
775 */
776 static
set_SSE_rounding_mode(ISelEnv * env,IRExpr * mode)777 void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
778 {
779 /* Note: this sequence only makes sense because DEFAULT_MXCSR has
780 both rounding bits == 0. If that wasn't the case, we couldn't
781 create a new rounding field simply by ORing the new value into
782 place. */
783
784 /* movq $3, %reg
785 andq [[mode]], %reg -- shouldn't be needed; paranoia
786 shlq $13, %reg
787 orq $DEFAULT_MXCSR, %reg
788 pushq %reg
789 ldmxcsr 0(%esp)
790 addq $8, %rsp
791 */
792 HReg reg = newVRegI(env);
793 AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
794 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
795 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
796 iselIntExpr_RMI(env, mode), reg));
797 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
798 addInstr(env, AMD64Instr_Alu64R(
799 Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
800 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
801 addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
802 add_to_rsp(env, 8);
803 }
804
805
806 /* Mess with the FPU's rounding mode: 'mode' is an I32-typed
807 expression denoting a value in the range 0 .. 3, indicating a round
808 mode encoded as per type IRRoundingMode. Set the x87 FPU to have
809 the same rounding.
810 */
811 static
set_FPU_rounding_mode(ISelEnv * env,IRExpr * mode)812 void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
813 {
814 HReg rrm = iselIntExpr_R(env, mode);
815 HReg rrm2 = newVRegI(env);
816 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
817
818 /* movq %rrm, %rrm2
819 andq $3, %rrm2 -- shouldn't be needed; paranoia
820 shlq $10, %rrm2
821 orq $DEFAULT_FPUCW, %rrm2
822 movq %rrm2, -8(%rsp)
823 fldcw -8(%esp)
824 */
825 addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
826 addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
827 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
828 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
829 AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
830 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,
831 AMD64RI_Reg(rrm2), m8_rsp));
832 addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
833 }
834
835
836 /* Generate all-zeroes into a new vector register.
837 */
generate_zeroes_V128(ISelEnv * env)838 static HReg generate_zeroes_V128 ( ISelEnv* env )
839 {
840 HReg dst = newVRegV(env);
841 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
842 return dst;
843 }
844
845 /* Generate all-ones into a new vector register.
846 */
generate_ones_V128(ISelEnv * env)847 static HReg generate_ones_V128 ( ISelEnv* env )
848 {
849 HReg dst = newVRegV(env);
850 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
851 return dst;
852 }
853
854
855 /* Generate !src into a new vector register. Amazing that there isn't
856 a less crappy way to do this.
857 */
do_sse_NotV128(ISelEnv * env,HReg src)858 static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
859 {
860 HReg dst = generate_ones_V128(env);
861 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
862 return dst;
863 }
864
865
866 /* Expand the given byte into a 64-bit word, by cloning each bit
867 8 times. */
bitmask8_to_bytemask64(UShort w8)868 static ULong bitmask8_to_bytemask64 ( UShort w8 )
869 {
870 vassert(w8 == (w8 & 0xFF));
871 ULong w64 = 0;
872 Int i;
873 for (i = 0; i < 8; i++) {
874 if (w8 & (1<<i))
875 w64 |= (0xFFULL << (8 * i));
876 }
877 return w64;
878 }
879
880
881 /*---------------------------------------------------------*/
882 /*--- ISEL: Integer expressions (64/32/16/8 bit) ---*/
883 /*---------------------------------------------------------*/
884
885 /* Select insns for an integer-typed expression, and add them to the
886 code list. Return a reg holding the result. This reg will be a
887 virtual register. THE RETURNED REG MUST NOT BE MODIFIED. If you
888 want to modify it, ask for a new vreg, copy it in there, and modify
889 the copy. The register allocator will do its best to map both
890 vregs to the same real register, so the copies will often disappear
891 later in the game.
892
893 This should handle expressions of 64, 32, 16 and 8-bit type. All
894 results are returned in a 64-bit register. For 32-, 16- and 8-bit
895 expressions, the upper 32/48/56 bits are arbitrary, so you should
896 mask or sign extend partial values if necessary.
897 */
898
iselIntExpr_R(ISelEnv * env,IRExpr * e)899 static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e )
900 {
901 HReg r = iselIntExpr_R_wrk(env, e);
902 /* sanity checks ... */
903 # if 0
904 vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
905 # endif
906 vassert(hregClass(r) == HRcInt64);
907 vassert(hregIsVirtual(r));
908 return r;
909 }
910
911 /* DO NOT CALL THIS DIRECTLY ! */
iselIntExpr_R_wrk(ISelEnv * env,IRExpr * e)912 static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
913 {
914 /* Used for unary/binary SIMD64 ops. */
915 HWord fn = 0;
916 Bool second_is_UInt;
917
918 MatchInfo mi;
919 DECLARE_PATTERN(p_1Uto8_64to1);
920 DECLARE_PATTERN(p_LDle8_then_8Uto64);
921 DECLARE_PATTERN(p_LDle16_then_16Uto64);
922
923 IRType ty = typeOfIRExpr(env->type_env,e);
924 switch (ty) {
925 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break;
926 default: vassert(0);
927 }
928
929 switch (e->tag) {
930
931 /* --------- TEMP --------- */
932 case Iex_RdTmp: {
933 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
934 }
935
936 /* --------- LOAD --------- */
937 case Iex_Load: {
938 HReg dst = newVRegI(env);
939 AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
940
941 /* We can't handle big-endian loads, nor load-linked. */
942 if (e->Iex.Load.end != Iend_LE)
943 goto irreducible;
944
945 if (ty == Ity_I64) {
946 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
947 AMD64RMI_Mem(amode), dst) );
948 return dst;
949 }
950 if (ty == Ity_I32) {
951 addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
952 return dst;
953 }
954 if (ty == Ity_I16) {
955 addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
956 return dst;
957 }
958 if (ty == Ity_I8) {
959 addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
960 return dst;
961 }
962 break;
963 }
964
965 /* --------- BINARY OP --------- */
966 case Iex_Binop: {
967 AMD64AluOp aluOp;
968 AMD64ShiftOp shOp;
969
970 /* Pattern: Sub64(0,x) */
971 /* and: Sub32(0,x) */
972 if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
973 || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
974 HReg dst = newVRegI(env);
975 HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
976 addInstr(env, mk_iMOVsd_RR(reg,dst));
977 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
978 return dst;
979 }
980
981 /* Is it an addition or logical style op? */
982 switch (e->Iex.Binop.op) {
983 case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
984 aluOp = Aalu_ADD; break;
985 case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
986 aluOp = Aalu_SUB; break;
987 case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
988 aluOp = Aalu_AND; break;
989 case Iop_Or8: case Iop_Or16: case Iop_Or32: case Iop_Or64:
990 aluOp = Aalu_OR; break;
991 case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
992 aluOp = Aalu_XOR; break;
993 case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
994 aluOp = Aalu_MUL; break;
995 default:
996 aluOp = Aalu_INVALID; break;
997 }
998 /* For commutative ops we assume any literal
999 values are on the second operand. */
1000 if (aluOp != Aalu_INVALID) {
1001 HReg dst = newVRegI(env);
1002 HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg1);
1003 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1004 addInstr(env, mk_iMOVsd_RR(reg,dst));
1005 addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
1006 return dst;
1007 }
1008
1009 /* Perhaps a shift op? */
1010 switch (e->Iex.Binop.op) {
1011 case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1012 shOp = Ash_SHL; break;
1013 case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
1014 shOp = Ash_SHR; break;
1015 case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
1016 shOp = Ash_SAR; break;
1017 default:
1018 shOp = Ash_INVALID; break;
1019 }
1020 if (shOp != Ash_INVALID) {
1021 HReg dst = newVRegI(env);
1022
1023 /* regL = the value to be shifted */
1024 HReg regL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1025 addInstr(env, mk_iMOVsd_RR(regL,dst));
1026
1027 /* Do any necessary widening for 32/16/8 bit operands */
1028 switch (e->Iex.Binop.op) {
1029 case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
1030 break;
1031 case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1032 break;
1033 case Iop_Shr8:
1034 addInstr(env, AMD64Instr_Alu64R(
1035 Aalu_AND, AMD64RMI_Imm(0xFF), dst));
1036 break;
1037 case Iop_Shr16:
1038 addInstr(env, AMD64Instr_Alu64R(
1039 Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
1040 break;
1041 case Iop_Shr32:
1042 addInstr(env, AMD64Instr_MovxLQ(False, dst, dst));
1043 break;
1044 case Iop_Sar8:
1045 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 56, dst));
1046 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 56, dst));
1047 break;
1048 case Iop_Sar16:
1049 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 48, dst));
1050 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 48, dst));
1051 break;
1052 case Iop_Sar32:
1053 addInstr(env, AMD64Instr_MovxLQ(True, dst, dst));
1054 break;
1055 default:
1056 ppIROp(e->Iex.Binop.op);
1057 vassert(0);
1058 }
1059
1060 /* Now consider the shift amount. If it's a literal, we
1061 can do a much better job than the general case. */
1062 if (e->Iex.Binop.arg2->tag == Iex_Const) {
1063 /* assert that the IR is well-typed */
1064 Int nshift;
1065 vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1066 nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1067 vassert(nshift >= 0);
1068 if (nshift > 0)
1069 /* Can't allow nshift==0 since that means %cl */
1070 addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
1071 } else {
1072 /* General case; we have to force the amount into %cl. */
1073 HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1074 addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
1075 addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
1076 }
1077 return dst;
1078 }
1079
1080 /* Deal with 64-bit SIMD binary ops */
1081 second_is_UInt = False;
1082 switch (e->Iex.Binop.op) {
1083 case Iop_Add8x8:
1084 fn = (HWord)h_generic_calc_Add8x8; break;
1085 case Iop_Add16x4:
1086 fn = (HWord)h_generic_calc_Add16x4; break;
1087 case Iop_Add32x2:
1088 fn = (HWord)h_generic_calc_Add32x2; break;
1089
1090 case Iop_Avg8Ux8:
1091 fn = (HWord)h_generic_calc_Avg8Ux8; break;
1092 case Iop_Avg16Ux4:
1093 fn = (HWord)h_generic_calc_Avg16Ux4; break;
1094
1095 case Iop_CmpEQ8x8:
1096 fn = (HWord)h_generic_calc_CmpEQ8x8; break;
1097 case Iop_CmpEQ16x4:
1098 fn = (HWord)h_generic_calc_CmpEQ16x4; break;
1099 case Iop_CmpEQ32x2:
1100 fn = (HWord)h_generic_calc_CmpEQ32x2; break;
1101
1102 case Iop_CmpGT8Sx8:
1103 fn = (HWord)h_generic_calc_CmpGT8Sx8; break;
1104 case Iop_CmpGT16Sx4:
1105 fn = (HWord)h_generic_calc_CmpGT16Sx4; break;
1106 case Iop_CmpGT32Sx2:
1107 fn = (HWord)h_generic_calc_CmpGT32Sx2; break;
1108
1109 case Iop_InterleaveHI8x8:
1110 fn = (HWord)h_generic_calc_InterleaveHI8x8; break;
1111 case Iop_InterleaveLO8x8:
1112 fn = (HWord)h_generic_calc_InterleaveLO8x8; break;
1113 case Iop_InterleaveHI16x4:
1114 fn = (HWord)h_generic_calc_InterleaveHI16x4; break;
1115 case Iop_InterleaveLO16x4:
1116 fn = (HWord)h_generic_calc_InterleaveLO16x4; break;
1117 case Iop_InterleaveHI32x2:
1118 fn = (HWord)h_generic_calc_InterleaveHI32x2; break;
1119 case Iop_InterleaveLO32x2:
1120 fn = (HWord)h_generic_calc_InterleaveLO32x2; break;
1121 case Iop_CatOddLanes16x4:
1122 fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
1123 case Iop_CatEvenLanes16x4:
1124 fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
1125 case Iop_Perm8x8:
1126 fn = (HWord)h_generic_calc_Perm8x8; break;
1127
1128 case Iop_Max8Ux8:
1129 fn = (HWord)h_generic_calc_Max8Ux8; break;
1130 case Iop_Max16Sx4:
1131 fn = (HWord)h_generic_calc_Max16Sx4; break;
1132 case Iop_Min8Ux8:
1133 fn = (HWord)h_generic_calc_Min8Ux8; break;
1134 case Iop_Min16Sx4:
1135 fn = (HWord)h_generic_calc_Min16Sx4; break;
1136
1137 case Iop_Mul16x4:
1138 fn = (HWord)h_generic_calc_Mul16x4; break;
1139 case Iop_Mul32x2:
1140 fn = (HWord)h_generic_calc_Mul32x2; break;
1141 case Iop_MulHi16Sx4:
1142 fn = (HWord)h_generic_calc_MulHi16Sx4; break;
1143 case Iop_MulHi16Ux4:
1144 fn = (HWord)h_generic_calc_MulHi16Ux4; break;
1145
1146 case Iop_QAdd8Sx8:
1147 fn = (HWord)h_generic_calc_QAdd8Sx8; break;
1148 case Iop_QAdd16Sx4:
1149 fn = (HWord)h_generic_calc_QAdd16Sx4; break;
1150 case Iop_QAdd8Ux8:
1151 fn = (HWord)h_generic_calc_QAdd8Ux8; break;
1152 case Iop_QAdd16Ux4:
1153 fn = (HWord)h_generic_calc_QAdd16Ux4; break;
1154
1155 case Iop_QNarrowBin32Sto16Sx4:
1156 fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break;
1157 case Iop_QNarrowBin16Sto8Sx8:
1158 fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
1159 case Iop_QNarrowBin16Sto8Ux8:
1160 fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;
1161 case Iop_NarrowBin16to8x8:
1162 fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
1163 case Iop_NarrowBin32to16x4:
1164 fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;
1165
1166 case Iop_QSub8Sx8:
1167 fn = (HWord)h_generic_calc_QSub8Sx8; break;
1168 case Iop_QSub16Sx4:
1169 fn = (HWord)h_generic_calc_QSub16Sx4; break;
1170 case Iop_QSub8Ux8:
1171 fn = (HWord)h_generic_calc_QSub8Ux8; break;
1172 case Iop_QSub16Ux4:
1173 fn = (HWord)h_generic_calc_QSub16Ux4; break;
1174
1175 case Iop_Sub8x8:
1176 fn = (HWord)h_generic_calc_Sub8x8; break;
1177 case Iop_Sub16x4:
1178 fn = (HWord)h_generic_calc_Sub16x4; break;
1179 case Iop_Sub32x2:
1180 fn = (HWord)h_generic_calc_Sub32x2; break;
1181
1182 case Iop_ShlN32x2:
1183 fn = (HWord)h_generic_calc_ShlN32x2;
1184 second_is_UInt = True;
1185 break;
1186 case Iop_ShlN16x4:
1187 fn = (HWord)h_generic_calc_ShlN16x4;
1188 second_is_UInt = True;
1189 break;
1190 case Iop_ShlN8x8:
1191 fn = (HWord)h_generic_calc_ShlN8x8;
1192 second_is_UInt = True;
1193 break;
1194 case Iop_ShrN32x2:
1195 fn = (HWord)h_generic_calc_ShrN32x2;
1196 second_is_UInt = True;
1197 break;
1198 case Iop_ShrN16x4:
1199 fn = (HWord)h_generic_calc_ShrN16x4;
1200 second_is_UInt = True;
1201 break;
1202 case Iop_SarN32x2:
1203 fn = (HWord)h_generic_calc_SarN32x2;
1204 second_is_UInt = True;
1205 break;
1206 case Iop_SarN16x4:
1207 fn = (HWord)h_generic_calc_SarN16x4;
1208 second_is_UInt = True;
1209 break;
1210 case Iop_SarN8x8:
1211 fn = (HWord)h_generic_calc_SarN8x8;
1212 second_is_UInt = True;
1213 break;
1214
1215 default:
1216 fn = (HWord)0; break;
1217 }
1218 if (fn != (HWord)0) {
1219 /* Note: the following assumes all helpers are of signature
1220 ULong fn ( ULong, ULong ), and they are
1221 not marked as regparm functions.
1222 */
1223 HReg dst = newVRegI(env);
1224 HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1225 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1226 if (second_is_UInt)
1227 addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
1228 addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
1229 addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
1230 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2,
1231 mk_RetLoc_simple(RLPri_Int) ));
1232 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1233 return dst;
1234 }
1235
1236 /* Handle misc other ops. */
1237
1238 if (e->Iex.Binop.op == Iop_Max32U) {
1239 HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1240 HReg dst = newVRegI(env);
1241 HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
1242 addInstr(env, mk_iMOVsd_RR(src1, dst));
1243 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
1244 addInstr(env, AMD64Instr_CMov64(Acc_B, src2, dst));
1245 return dst;
1246 }
1247
1248 if (e->Iex.Binop.op == Iop_DivModS64to32
1249 || e->Iex.Binop.op == Iop_DivModU64to32) {
1250 /* 64 x 32 -> (32(rem),32(div)) division */
1251 /* Get the 64-bit operand into edx:eax, and the other into
1252 any old R/M. */
1253 HReg rax = hregAMD64_RAX();
1254 HReg rdx = hregAMD64_RDX();
1255 HReg dst = newVRegI(env);
1256 Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
1257 AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
1258 /* Compute the left operand into a reg, and then
1259 put the top half in edx and the bottom in eax. */
1260 HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1261 addInstr(env, mk_iMOVsd_RR(left64, rdx));
1262 addInstr(env, mk_iMOVsd_RR(left64, rax));
1263 addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
1264 addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
1265 addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
1266 addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
1267 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
1268 addInstr(env, mk_iMOVsd_RR(rax, dst));
1269 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
1270 return dst;
1271 }
1272
1273 if (e->Iex.Binop.op == Iop_32HLto64) {
1274 HReg hi32 = newVRegI(env);
1275 HReg lo32 = newVRegI(env);
1276 HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1277 HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1278 addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
1279 addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
1280 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
1281 addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
1282 addInstr(env, AMD64Instr_Alu64R(
1283 Aalu_OR, AMD64RMI_Reg(lo32), hi32));
1284 return hi32;
1285 }
1286
1287 if (e->Iex.Binop.op == Iop_16HLto32) {
1288 HReg hi16 = newVRegI(env);
1289 HReg lo16 = newVRegI(env);
1290 HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1291 HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1292 addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
1293 addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
1294 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
1295 addInstr(env, AMD64Instr_Alu64R(
1296 Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
1297 addInstr(env, AMD64Instr_Alu64R(
1298 Aalu_OR, AMD64RMI_Reg(lo16), hi16));
1299 return hi16;
1300 }
1301
1302 if (e->Iex.Binop.op == Iop_8HLto16) {
1303 HReg hi8 = newVRegI(env);
1304 HReg lo8 = newVRegI(env);
1305 HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1306 HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1307 addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
1308 addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
1309 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
1310 addInstr(env, AMD64Instr_Alu64R(
1311 Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
1312 addInstr(env, AMD64Instr_Alu64R(
1313 Aalu_OR, AMD64RMI_Reg(lo8), hi8));
1314 return hi8;
1315 }
1316
1317 if (e->Iex.Binop.op == Iop_MullS32
1318 || e->Iex.Binop.op == Iop_MullS16
1319 || e->Iex.Binop.op == Iop_MullS8
1320 || e->Iex.Binop.op == Iop_MullU32
1321 || e->Iex.Binop.op == Iop_MullU16
1322 || e->Iex.Binop.op == Iop_MullU8) {
1323 HReg a32 = newVRegI(env);
1324 HReg b32 = newVRegI(env);
1325 HReg a32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1326 HReg b32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1327 Int shift = 0;
1328 AMD64ShiftOp shr_op = Ash_SHR;
1329 switch (e->Iex.Binop.op) {
1330 case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
1331 case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
1332 case Iop_MullS8: shr_op = Ash_SAR; shift = 56; break;
1333 case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
1334 case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
1335 case Iop_MullU8: shr_op = Ash_SHR; shift = 56; break;
1336 default: vassert(0);
1337 }
1338
1339 addInstr(env, mk_iMOVsd_RR(a32s, a32));
1340 addInstr(env, mk_iMOVsd_RR(b32s, b32));
1341 addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
1342 addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
1343 addInstr(env, AMD64Instr_Sh64(shr_op, shift, a32));
1344 addInstr(env, AMD64Instr_Sh64(shr_op, shift, b32));
1345 addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
1346 return b32;
1347 }
1348
1349 if (e->Iex.Binop.op == Iop_CmpF64) {
1350 HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
1351 HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
1352 HReg dst = newVRegI(env);
1353 addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
1354 /* Mask out irrelevant parts of the result so as to conform
1355 to the CmpF64 definition. */
1356 addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
1357 return dst;
1358 }
1359
1360 if (e->Iex.Binop.op == Iop_F64toI32S
1361 || e->Iex.Binop.op == Iop_F64toI64S) {
1362 Int szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
1363 HReg rf = iselDblExpr(env, e->Iex.Binop.arg2);
1364 HReg dst = newVRegI(env);
1365 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
1366 addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
1367 set_SSE_rounding_default(env);
1368 return dst;
1369 }
1370
1371 break;
1372 }
1373
1374 /* --------- UNARY OP --------- */
1375 case Iex_Unop: {
1376
1377 /* 1Uto8(64to1(expr64)) */
1378 {
1379 DEFINE_PATTERN( p_1Uto8_64to1,
1380 unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
1381 if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
1382 IRExpr* expr64 = mi.bindee[0];
1383 HReg dst = newVRegI(env);
1384 HReg src = iselIntExpr_R(env, expr64);
1385 addInstr(env, mk_iMOVsd_RR(src,dst) );
1386 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1387 AMD64RMI_Imm(1), dst));
1388 return dst;
1389 }
1390 }
1391
1392 /* 8Uto64(LDle(expr64)) */
1393 {
1394 DEFINE_PATTERN(p_LDle8_then_8Uto64,
1395 unop(Iop_8Uto64,
1396 IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1397 if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
1398 HReg dst = newVRegI(env);
1399 AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1400 addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
1401 return dst;
1402 }
1403 }
1404
1405 /* 16Uto64(LDle(expr64)) */
1406 {
1407 DEFINE_PATTERN(p_LDle16_then_16Uto64,
1408 unop(Iop_16Uto64,
1409 IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
1410 if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
1411 HReg dst = newVRegI(env);
1412 AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1413 addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
1414 return dst;
1415 }
1416 }
1417
1418 /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
1419 Use 32 bit arithmetic and let the default zero-extend rule
1420 do the 32Uto64 for free. */
1421 if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
1422 IROp opi = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
1423 IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
1424 IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
1425 AMD64AluOp aluOp = Aalu_INVALID;
1426 switch (opi) {
1427 case Iop_Add32: aluOp = Aalu_ADD; break;
1428 case Iop_Sub32: aluOp = Aalu_SUB; break;
1429 case Iop_And32: aluOp = Aalu_AND; break;
1430 case Iop_Or32: aluOp = Aalu_OR; break;
1431 case Iop_Xor32: aluOp = Aalu_XOR; break;
1432 default: break;
1433 }
1434 if (aluOp != Aalu_INVALID) {
1435 /* For commutative ops we assume any literal values are on
1436 the second operand. */
1437 HReg dst = newVRegI(env);
1438 HReg reg = iselIntExpr_R(env, argL);
1439 AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
1440 addInstr(env, mk_iMOVsd_RR(reg,dst));
1441 addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
1442 return dst;
1443 }
1444 /* just fall through to normal handling for Iop_32Uto64 */
1445 }
1446
1447 /* Fallback cases */
1448 switch (e->Iex.Unop.op) {
1449 case Iop_32Uto64:
1450 case Iop_32Sto64: {
1451 HReg dst = newVRegI(env);
1452 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1453 addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
1454 src, dst) );
1455 return dst;
1456 }
1457 case Iop_128HIto64: {
1458 HReg rHi, rLo;
1459 iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1460 return rHi; /* and abandon rLo */
1461 }
1462 case Iop_128to64: {
1463 HReg rHi, rLo;
1464 iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1465 return rLo; /* and abandon rHi */
1466 }
1467 case Iop_8Uto16:
1468 case Iop_8Uto32:
1469 case Iop_8Uto64:
1470 case Iop_16Uto64:
1471 case Iop_16Uto32: {
1472 HReg dst = newVRegI(env);
1473 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1474 Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
1475 || e->Iex.Unop.op==Iop_16Uto64 );
1476 UInt mask = srcIs16 ? 0xFFFF : 0xFF;
1477 addInstr(env, mk_iMOVsd_RR(src,dst) );
1478 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1479 AMD64RMI_Imm(mask), dst));
1480 return dst;
1481 }
1482 case Iop_8Sto16:
1483 case Iop_8Sto64:
1484 case Iop_8Sto32:
1485 case Iop_16Sto32:
1486 case Iop_16Sto64: {
1487 HReg dst = newVRegI(env);
1488 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1489 Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
1490 || e->Iex.Unop.op==Iop_16Sto64 );
1491 UInt amt = srcIs16 ? 48 : 56;
1492 addInstr(env, mk_iMOVsd_RR(src,dst) );
1493 addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
1494 addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
1495 return dst;
1496 }
1497 case Iop_Not8:
1498 case Iop_Not16:
1499 case Iop_Not32:
1500 case Iop_Not64: {
1501 HReg dst = newVRegI(env);
1502 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1503 addInstr(env, mk_iMOVsd_RR(src,dst) );
1504 addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
1505 return dst;
1506 }
1507 case Iop_16HIto8:
1508 case Iop_32HIto16:
1509 case Iop_64HIto32: {
1510 HReg dst = newVRegI(env);
1511 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1512 Int shift = 0;
1513 switch (e->Iex.Unop.op) {
1514 case Iop_16HIto8: shift = 8; break;
1515 case Iop_32HIto16: shift = 16; break;
1516 case Iop_64HIto32: shift = 32; break;
1517 default: vassert(0);
1518 }
1519 addInstr(env, mk_iMOVsd_RR(src,dst) );
1520 addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
1521 return dst;
1522 }
1523 case Iop_1Uto64:
1524 case Iop_1Uto32:
1525 case Iop_1Uto8: {
1526 HReg dst = newVRegI(env);
1527 AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1528 addInstr(env, AMD64Instr_Set64(cond,dst));
1529 return dst;
1530 }
1531 case Iop_1Sto8:
1532 case Iop_1Sto16:
1533 case Iop_1Sto32:
1534 case Iop_1Sto64: {
1535 /* could do better than this, but for now ... */
1536 HReg dst = newVRegI(env);
1537 AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1538 addInstr(env, AMD64Instr_Set64(cond,dst));
1539 addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
1540 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1541 return dst;
1542 }
1543 case Iop_Ctz64: {
1544 /* Count trailing zeroes, implemented by amd64 'bsfq' */
1545 HReg dst = newVRegI(env);
1546 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1547 addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
1548 return dst;
1549 }
1550 case Iop_Clz64: {
1551 /* Count leading zeroes. Do 'bsrq' to establish the index
1552 of the highest set bit, and subtract that value from
1553 63. */
1554 HReg tmp = newVRegI(env);
1555 HReg dst = newVRegI(env);
1556 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1557 addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
1558 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
1559 AMD64RMI_Imm(63), dst));
1560 addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
1561 AMD64RMI_Reg(tmp), dst));
1562 return dst;
1563 }
1564
1565 case Iop_CmpwNEZ64: {
1566 HReg dst = newVRegI(env);
1567 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1568 addInstr(env, mk_iMOVsd_RR(src,dst));
1569 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1570 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1571 AMD64RMI_Reg(src), dst));
1572 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1573 return dst;
1574 }
1575
1576 case Iop_CmpwNEZ32: {
1577 HReg src = newVRegI(env);
1578 HReg dst = newVRegI(env);
1579 HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
1580 addInstr(env, mk_iMOVsd_RR(pre,src));
1581 addInstr(env, AMD64Instr_MovxLQ(False, src, src));
1582 addInstr(env, mk_iMOVsd_RR(src,dst));
1583 addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1584 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1585 AMD64RMI_Reg(src), dst));
1586 addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1587 return dst;
1588 }
1589
1590 case Iop_Left8:
1591 case Iop_Left16:
1592 case Iop_Left32:
1593 case Iop_Left64: {
1594 HReg dst = newVRegI(env);
1595 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1596 addInstr(env, mk_iMOVsd_RR(src, dst));
1597 addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
1598 addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
1599 return dst;
1600 }
1601
1602 case Iop_V128to32: {
1603 HReg dst = newVRegI(env);
1604 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1605 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
1606 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
1607 addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
1608 return dst;
1609 }
1610
1611 /* V128{HI}to64 */
1612 case Iop_V128HIto64:
1613 case Iop_V128to64: {
1614 HReg dst = newVRegI(env);
1615 Int off = e->Iex.Unop.op==Iop_V128HIto64 ? -8 : -16;
1616 HReg rsp = hregAMD64_RSP();
1617 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1618 AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1619 AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp);
1620 addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1621 16, vec, m16_rsp));
1622 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1623 AMD64RMI_Mem(off_rsp), dst ));
1624 return dst;
1625 }
1626
1627 case Iop_V256to64_0: case Iop_V256to64_1:
1628 case Iop_V256to64_2: case Iop_V256to64_3: {
1629 HReg vHi, vLo, vec;
1630 iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
1631 /* Do the first part of the selection by deciding which of
1632 the 128 bit registers do look at, and second part using
1633 the same scheme as for V128{HI}to64 above. */
1634 Int off = 0;
1635 switch (e->Iex.Unop.op) {
1636 case Iop_V256to64_0: vec = vLo; off = -16; break;
1637 case Iop_V256to64_1: vec = vLo; off = -8; break;
1638 case Iop_V256to64_2: vec = vHi; off = -16; break;
1639 case Iop_V256to64_3: vec = vHi; off = -8; break;
1640 default: vassert(0);
1641 }
1642 HReg dst = newVRegI(env);
1643 HReg rsp = hregAMD64_RSP();
1644 AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1645 AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp);
1646 addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1647 16, vec, m16_rsp));
1648 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1649 AMD64RMI_Mem(off_rsp), dst ));
1650 return dst;
1651 }
1652
1653 /* ReinterpF64asI64(e) */
1654 /* Given an IEEE754 double, produce an I64 with the same bit
1655 pattern. */
1656 case Iop_ReinterpF64asI64: {
1657 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1658 HReg dst = newVRegI(env);
1659 HReg src = iselDblExpr(env, e->Iex.Unop.arg);
1660 /* paranoia */
1661 set_SSE_rounding_default(env);
1662 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp));
1663 addInstr(env, AMD64Instr_Alu64R(
1664 Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst));
1665 return dst;
1666 }
1667
1668 /* ReinterpF32asI32(e) */
1669 /* Given an IEEE754 single, produce an I64 with the same bit
1670 pattern in the lower half. */
1671 case Iop_ReinterpF32asI32: {
1672 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1673 HReg dst = newVRegI(env);
1674 HReg src = iselFltExpr(env, e->Iex.Unop.arg);
1675 /* paranoia */
1676 set_SSE_rounding_default(env);
1677 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp));
1678 addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst ));
1679 return dst;
1680 }
1681
1682 case Iop_16to8:
1683 case Iop_32to8:
1684 case Iop_64to8:
1685 case Iop_32to16:
1686 case Iop_64to16:
1687 case Iop_64to32:
1688 /* These are no-ops. */
1689 return iselIntExpr_R(env, e->Iex.Unop.arg);
1690
1691 case Iop_GetMSBs8x8: {
1692 /* Note: the following assumes the helper is of
1693 signature
1694 UInt fn ( ULong ), and is not a regparm fn.
1695 */
1696 HReg dst = newVRegI(env);
1697 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1698 fn = (HWord)h_generic_calc_GetMSBs8x8;
1699 addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1700 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1701 1, mk_RetLoc_simple(RLPri_Int) ));
1702 /* MovxLQ is not exactly the right thing here. We just
1703 need to get the bottom 8 bits of RAX into dst, and zero
1704 out everything else. Assuming that the helper returns
1705 a UInt with the top 24 bits zeroed out, it'll do,
1706 though. */
1707 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1708 return dst;
1709 }
1710
1711 case Iop_GetMSBs8x16: {
1712 /* Note: the following assumes the helper is of signature
1713 UInt fn ( ULong w64hi, ULong w64Lo ),
1714 and is not a regparm fn. */
1715 HReg dst = newVRegI(env);
1716 HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1717 HReg rsp = hregAMD64_RSP();
1718 fn = (HWord)h_generic_calc_GetMSBs8x16;
1719 AMD64AMode* m8_rsp = AMD64AMode_IR( -8, rsp);
1720 AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1721 addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1722 16, vec, m16_rsp));
1723 /* hi 64 bits into RDI -- the first arg */
1724 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1725 AMD64RMI_Mem(m8_rsp),
1726 hregAMD64_RDI() )); /* 1st arg */
1727 /* lo 64 bits into RSI -- the 2nd arg */
1728 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1729 AMD64RMI_Mem(m16_rsp),
1730 hregAMD64_RSI() )); /* 2nd arg */
1731 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1732 2, mk_RetLoc_simple(RLPri_Int) ));
1733 /* MovxLQ is not exactly the right thing here. We just
1734 need to get the bottom 16 bits of RAX into dst, and zero
1735 out everything else. Assuming that the helper returns
1736 a UInt with the top 16 bits zeroed out, it'll do,
1737 though. */
1738 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1739 return dst;
1740 }
1741
1742 default:
1743 break;
1744 }
1745
1746 /* Deal with unary 64-bit SIMD ops. */
1747 switch (e->Iex.Unop.op) {
1748 case Iop_CmpNEZ32x2:
1749 fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
1750 case Iop_CmpNEZ16x4:
1751 fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
1752 case Iop_CmpNEZ8x8:
1753 fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
1754 default:
1755 fn = (HWord)0; break;
1756 }
1757 if (fn != (HWord)0) {
1758 /* Note: the following assumes all helpers are of
1759 signature
1760 ULong fn ( ULong ), and they are
1761 not marked as regparm functions.
1762 */
1763 HReg dst = newVRegI(env);
1764 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1765 addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1766 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1,
1767 mk_RetLoc_simple(RLPri_Int) ));
1768 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1769 return dst;
1770 }
1771
1772 break;
1773 }
1774
1775 /* --------- GET --------- */
1776 case Iex_Get: {
1777 if (ty == Ity_I64) {
1778 HReg dst = newVRegI(env);
1779 addInstr(env, AMD64Instr_Alu64R(
1780 Aalu_MOV,
1781 AMD64RMI_Mem(
1782 AMD64AMode_IR(e->Iex.Get.offset,
1783 hregAMD64_RBP())),
1784 dst));
1785 return dst;
1786 }
1787 if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
1788 HReg dst = newVRegI(env);
1789 addInstr(env, AMD64Instr_LoadEX(
1790 toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
1791 False,
1792 AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
1793 dst));
1794 return dst;
1795 }
1796 break;
1797 }
1798
1799 case Iex_GetI: {
1800 AMD64AMode* am
1801 = genGuestArrayOffset(
1802 env, e->Iex.GetI.descr,
1803 e->Iex.GetI.ix, e->Iex.GetI.bias );
1804 HReg dst = newVRegI(env);
1805 if (ty == Ity_I8) {
1806 addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
1807 return dst;
1808 }
1809 if (ty == Ity_I64) {
1810 addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
1811 return dst;
1812 }
1813 break;
1814 }
1815
1816 /* --------- CCALL --------- */
1817 case Iex_CCall: {
1818 HReg dst = newVRegI(env);
1819 vassert(ty == e->Iex.CCall.retty);
1820
1821 /* be very restrictive for now. Only 64-bit ints allowed for
1822 args, and 64 or 32 bits for return type. */
1823 if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
1824 goto irreducible;
1825
1826 /* Marshal args, do the call. */
1827 UInt addToSp = 0;
1828 RetLoc rloc = mk_RetLoc_INVALID();
1829 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
1830 e->Iex.CCall.cee, e->Iex.CCall.retty, e->Iex.CCall.args );
1831 vassert(is_sane_RetLoc(rloc));
1832 vassert(rloc.pri == RLPri_Int);
1833 vassert(addToSp == 0);
1834
1835 /* Move to dst, and zero out the top 32 bits if the result type is
1836 Ity_I32. Probably overkill, but still .. */
1837 if (e->Iex.CCall.retty == Ity_I64)
1838 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1839 else
1840 addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1841
1842 return dst;
1843 }
1844
1845 /* --------- LITERAL --------- */
1846 /* 64/32/16/8-bit literals */
1847 case Iex_Const:
1848 if (ty == Ity_I64) {
1849 HReg r = newVRegI(env);
1850 addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
1851 return r;
1852 } else {
1853 AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
1854 HReg r = newVRegI(env);
1855 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
1856 return r;
1857 }
1858
1859 /* --------- MULTIPLEX --------- */
1860 case Iex_ITE: { // VFD
1861 if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
1862 && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
1863 HReg r1 = iselIntExpr_R(env, e->Iex.ITE.iftrue);
1864 HReg r0 = iselIntExpr_R(env, e->Iex.ITE.iffalse);
1865 HReg dst = newVRegI(env);
1866 addInstr(env, mk_iMOVsd_RR(r1,dst));
1867 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
1868 addInstr(env, AMD64Instr_CMov64(cc ^ 1, r0, dst));
1869 return dst;
1870 }
1871 break;
1872 }
1873
1874 /* --------- TERNARY OP --------- */
1875 case Iex_Triop: {
1876 IRTriop *triop = e->Iex.Triop.details;
1877 /* C3210 flags following FPU partial remainder (fprem), both
1878 IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
1879 if (triop->op == Iop_PRemC3210F64
1880 || triop->op == Iop_PRem1C3210F64) {
1881 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1882 HReg arg1 = iselDblExpr(env, triop->arg2);
1883 HReg arg2 = iselDblExpr(env, triop->arg3);
1884 HReg dst = newVRegI(env);
1885 addInstr(env, AMD64Instr_A87Free(2));
1886
1887 /* one arg -> top of x87 stack */
1888 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
1889 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1890
1891 /* other arg -> top of x87 stack */
1892 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
1893 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1894
1895 switch (triop->op) {
1896 case Iop_PRemC3210F64:
1897 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
1898 break;
1899 case Iop_PRem1C3210F64:
1900 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
1901 break;
1902 default:
1903 vassert(0);
1904 }
1905 /* Ignore the result, and instead make off with the FPU's
1906 C3210 flags (in the status word). */
1907 addInstr(env, AMD64Instr_A87StSW(m8_rsp));
1908 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
1909 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
1910 return dst;
1911 }
1912 break;
1913 }
1914
1915 default:
1916 break;
1917 } /* switch (e->tag) */
1918
1919 /* We get here if no pattern matched. */
1920 irreducible:
1921 ppIRExpr(e);
1922 vpanic("iselIntExpr_R(amd64): cannot reduce tree");
1923 }
1924
1925
1926 /*---------------------------------------------------------*/
1927 /*--- ISEL: Integer expression auxiliaries ---*/
1928 /*---------------------------------------------------------*/
1929
1930 /* --------------------- AMODEs --------------------- */
1931
/* Return an AMode which computes the value of the specified
   expression, possibly also adding insns to the code list as a
   result.  The expression may only be a 64-bit one.
*/
1936
iselIntExpr_AMode(ISelEnv * env,IRExpr * e)1937 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e )
1938 {
1939 AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
1940 vassert(sane_AMode(am));
1941 return am;
1942 }
1943
1944 /* DO NOT CALL THIS DIRECTLY ! */
iselIntExpr_AMode_wrk(ISelEnv * env,IRExpr * e)1945 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e )
1946 {
1947 MatchInfo mi;
1948 DECLARE_PATTERN(p_complex);
1949 IRType ty = typeOfIRExpr(env->type_env,e);
1950 vassert(ty == Ity_I64);
1951
1952 /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
1953 /* bind0 bind1 bind2 bind3 */
1954 DEFINE_PATTERN(p_complex,
1955 binop( Iop_Add64,
1956 binop( Iop_Add64,
1957 bind(0),
1958 binop(Iop_Shl64, bind(1), bind(2))
1959 ),
1960 bind(3)
1961 )
1962 );
1963 if (matchIRExpr(&mi, p_complex, e)) {
1964 IRExpr* expr1 = mi.bindee[0];
1965 IRExpr* expr2 = mi.bindee[1];
1966 IRExpr* imm8 = mi.bindee[2];
1967 IRExpr* simm32 = mi.bindee[3];
1968 if (imm8->tag == Iex_Const
1969 && imm8->Iex.Const.con->tag == Ico_U8
1970 && imm8->Iex.Const.con->Ico.U8 < 4
1971 /* imm8 is OK, now check simm32 */
1972 && simm32->tag == Iex_Const
1973 && simm32->Iex.Const.con->tag == Ico_U64
1974 && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
1975 UInt shift = imm8->Iex.Const.con->Ico.U8;
1976 UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
1977 HReg r1 = iselIntExpr_R(env, expr1);
1978 HReg r2 = iselIntExpr_R(env, expr2);
1979 vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
1980 return AMD64AMode_IRRS(offset, r1, r2, shift);
1981 }
1982 }
1983
1984 /* Add64(expr1, Shl64(expr2, imm)) */
1985 if (e->tag == Iex_Binop
1986 && e->Iex.Binop.op == Iop_Add64
1987 && e->Iex.Binop.arg2->tag == Iex_Binop
1988 && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
1989 && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
1990 && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
1991 UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1992 if (shift == 1 || shift == 2 || shift == 3) {
1993 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1994 HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
1995 return AMD64AMode_IRRS(0, r1, r2, shift);
1996 }
1997 }
1998
1999 /* Add64(expr,i) */
2000 if (e->tag == Iex_Binop
2001 && e->Iex.Binop.op == Iop_Add64
2002 && e->Iex.Binop.arg2->tag == Iex_Const
2003 && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
2004 && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
2005 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2006 return AMD64AMode_IR(
2007 toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
2008 r1
2009 );
2010 }
2011
2012 /* Doesn't match anything in particular. Generate it into
2013 a register and use that. */
2014 {
2015 HReg r1 = iselIntExpr_R(env, e);
2016 return AMD64AMode_IR(0, r1);
2017 }
2018 }
2019
2020
2021 /* --------------------- RMIs --------------------- */
2022
/* Similarly, calculate an expression into an AMD64RMI operand.  As with
   iselIntExpr_R, the expression can have type 64, 32, 16 or 8 bits.  */
2025
iselIntExpr_RMI(ISelEnv * env,IRExpr * e)2026 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, IRExpr* e )
2027 {
2028 AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
2029 /* sanity checks ... */
2030 switch (rmi->tag) {
2031 case Armi_Imm:
2032 return rmi;
2033 case Armi_Reg:
2034 vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
2035 vassert(hregIsVirtual(rmi->Armi.Reg.reg));
2036 return rmi;
2037 case Armi_Mem:
2038 vassert(sane_AMode(rmi->Armi.Mem.am));
2039 return rmi;
2040 default:
2041 vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
2042 }
2043 }
2044
2045 /* DO NOT CALL THIS DIRECTLY ! */
iselIntExpr_RMI_wrk(ISelEnv * env,IRExpr * e)2046 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e )
2047 {
2048 IRType ty = typeOfIRExpr(env->type_env,e);
2049 vassert(ty == Ity_I64 || ty == Ity_I32
2050 || ty == Ity_I16 || ty == Ity_I8);
2051
2052 /* special case: immediate 64/32/16/8 */
2053 if (e->tag == Iex_Const) {
2054 switch (e->Iex.Const.con->tag) {
2055 case Ico_U64:
2056 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2057 return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2058 }
2059 break;
2060 case Ico_U32:
2061 return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32); break;
2062 case Ico_U16:
2063 return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16); break;
2064 case Ico_U8:
2065 return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8); break;
2066 default:
2067 vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2068 }
2069 }
2070
2071 /* special case: 64-bit GET */
2072 if (e->tag == Iex_Get && ty == Ity_I64) {
2073 return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2074 hregAMD64_RBP()));
2075 }
2076
2077 /* special case: 64-bit load from memory */
2078 if (e->tag == Iex_Load && ty == Ity_I64
2079 && e->Iex.Load.end == Iend_LE) {
2080 AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2081 return AMD64RMI_Mem(am);
2082 }
2083
2084 /* default case: calculate into a register and return that */
2085 {
2086 HReg r = iselIntExpr_R ( env, e );
2087 return AMD64RMI_Reg(r);
2088 }
2089 }
2090
2091
2092 /* --------------------- RIs --------------------- */
2093
2094 /* Calculate an expression into an AMD64RI operand. As with
2095 iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2096 bits. */
2097
iselIntExpr_RI(ISelEnv * env,IRExpr * e)2098 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, IRExpr* e )
2099 {
2100 AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
2101 /* sanity checks ... */
2102 switch (ri->tag) {
2103 case Ari_Imm:
2104 return ri;
2105 case Ari_Reg:
2106 vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
2107 vassert(hregIsVirtual(ri->Ari.Reg.reg));
2108 return ri;
2109 default:
2110 vpanic("iselIntExpr_RI: unknown amd64 RI tag");
2111 }
2112 }
2113
2114 /* DO NOT CALL THIS DIRECTLY ! */
iselIntExpr_RI_wrk(ISelEnv * env,IRExpr * e)2115 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e )
2116 {
2117 IRType ty = typeOfIRExpr(env->type_env,e);
2118 vassert(ty == Ity_I64 || ty == Ity_I32
2119 || ty == Ity_I16 || ty == Ity_I8);
2120
2121 /* special case: immediate */
2122 if (e->tag == Iex_Const) {
2123 switch (e->Iex.Const.con->tag) {
2124 case Ico_U64:
2125 if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2126 return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2127 }
2128 break;
2129 case Ico_U32:
2130 return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
2131 case Ico_U16:
2132 return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
2133 case Ico_U8:
2134 return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
2135 default:
2136 vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2137 }
2138 }
2139
2140 /* default case: calculate into a register and return that */
2141 {
2142 HReg r = iselIntExpr_R ( env, e );
2143 return AMD64RI_Reg(r);
2144 }
2145 }
2146
2147
2148 /* --------------------- RMs --------------------- */
2149
2150 /* Similarly, calculate an expression into an AMD64RM operand. As
2151 with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2152 bits. */
2153
iselIntExpr_RM(ISelEnv * env,IRExpr * e)2154 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, IRExpr* e )
2155 {
2156 AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
2157 /* sanity checks ... */
2158 switch (rm->tag) {
2159 case Arm_Reg:
2160 vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
2161 vassert(hregIsVirtual(rm->Arm.Reg.reg));
2162 return rm;
2163 case Arm_Mem:
2164 vassert(sane_AMode(rm->Arm.Mem.am));
2165 return rm;
2166 default:
2167 vpanic("iselIntExpr_RM: unknown amd64 RM tag");
2168 }
2169 }
2170
2171 /* DO NOT CALL THIS DIRECTLY ! */
iselIntExpr_RM_wrk(ISelEnv * env,IRExpr * e)2172 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e )
2173 {
2174 IRType ty = typeOfIRExpr(env->type_env,e);
2175 vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
2176
2177 /* special case: 64-bit GET */
2178 if (e->tag == Iex_Get && ty == Ity_I64) {
2179 return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2180 hregAMD64_RBP()));
2181 }
2182
2183 /* special case: load from memory */
2184
2185 /* default case: calculate into a register and return that */
2186 {
2187 HReg r = iselIntExpr_R ( env, e );
2188 return AMD64RM_Reg(r);
2189 }
2190 }
2191
2192
2193 /* --------------------- CONDCODE --------------------- */
2194
/* Generate code to evaluate a bit-typed expression, returning the
   condition code which would correspond when the expression would
   notionally have returned 1. */
2198
iselCondCode(ISelEnv * env,IRExpr * e)2199 static AMD64CondCode iselCondCode ( ISelEnv* env, IRExpr* e )
2200 {
2201 /* Uh, there's nothing we can sanity check here, unfortunately. */
2202 return iselCondCode_wrk(env,e);
2203 }
2204
2205 /* DO NOT CALL THIS DIRECTLY ! */
iselCondCode_wrk(ISelEnv * env,IRExpr * e)2206 static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
2207 {
2208 MatchInfo mi;
2209
2210 vassert(e);
2211 vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
2212
2213 /* var */
2214 if (e->tag == Iex_RdTmp) {
2215 HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
2216 HReg dst = newVRegI(env);
2217 addInstr(env, mk_iMOVsd_RR(r64,dst));
2218 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(1),dst));
2219 return Acc_NZ;
2220 }
2221
2222 /* Constant 1:Bit */
2223 if (e->tag == Iex_Const) {
2224 HReg r;
2225 vassert(e->Iex.Const.con->tag == Ico_U1);
2226 vassert(e->Iex.Const.con->Ico.U1 == True
2227 || e->Iex.Const.con->Ico.U1 == False);
2228 r = newVRegI(env);
2229 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
2230 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
2231 return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
2232 }
2233
2234 /* Not1(...) */
2235 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
2236 /* Generate code for the arg, and negate the test condition */
2237 return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
2238 }
2239
2240 /* --- patterns rooted at: 64to1 --- */
2241
2242 /* 64to1 */
2243 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
2244 HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2245 addInstr(env, AMD64Instr_Test64(1,reg));
2246 return Acc_NZ;
2247 }
2248
2249 /* --- patterns rooted at: 32to1 --- */
2250
2251 /* 32to1 */
2252 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_32to1) {
2253 HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2254 addInstr(env, AMD64Instr_Test64(1,reg));
2255 return Acc_NZ;
2256 }
2257
2258 /* --- patterns rooted at: CmpNEZ8 --- */
2259
2260 /* CmpNEZ8(x) */
2261 if (e->tag == Iex_Unop
2262 && e->Iex.Unop.op == Iop_CmpNEZ8) {
2263 HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2264 addInstr(env, AMD64Instr_Test64(0xFF,r));
2265 return Acc_NZ;
2266 }
2267
2268 /* --- patterns rooted at: CmpNEZ16 --- */
2269
2270 /* CmpNEZ16(x) */
2271 if (e->tag == Iex_Unop
2272 && e->Iex.Unop.op == Iop_CmpNEZ16) {
2273 HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2274 addInstr(env, AMD64Instr_Test64(0xFFFF,r));
2275 return Acc_NZ;
2276 }
2277
2278 /* --- patterns rooted at: CmpNEZ32 --- */
2279
2280 /* CmpNEZ32(x) */
2281 if (e->tag == Iex_Unop
2282 && e->Iex.Unop.op == Iop_CmpNEZ32) {
2283 HReg r1 = iselIntExpr_R(env, e->Iex.Unop.arg);
2284 AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2285 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2286 return Acc_NZ;
2287 }
2288
2289 /* --- patterns rooted at: CmpNEZ64 --- */
2290
2291 /* CmpNEZ64(Or64(x,y)) */
2292 {
2293 DECLARE_PATTERN(p_CmpNEZ64_Or64);
2294 DEFINE_PATTERN(p_CmpNEZ64_Or64,
2295 unop(Iop_CmpNEZ64, binop(Iop_Or64, bind(0), bind(1))));
2296 if (matchIRExpr(&mi, p_CmpNEZ64_Or64, e)) {
2297 HReg r0 = iselIntExpr_R(env, mi.bindee[0]);
2298 AMD64RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
2299 HReg tmp = newVRegI(env);
2300 addInstr(env, mk_iMOVsd_RR(r0, tmp));
2301 addInstr(env, AMD64Instr_Alu64R(Aalu_OR,rmi1,tmp));
2302 return Acc_NZ;
2303 }
2304 }
2305
2306 /* CmpNEZ64(x) */
2307 if (e->tag == Iex_Unop
2308 && e->Iex.Unop.op == Iop_CmpNEZ64) {
2309 HReg r1 = iselIntExpr_R(env, e->Iex.Unop.arg);
2310 AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2311 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2312 return Acc_NZ;
2313 }
2314
2315 /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
2316
2317 /* CmpEQ8 / CmpNE8 */
2318 if (e->tag == Iex_Binop
2319 && (e->Iex.Binop.op == Iop_CmpEQ8
2320 || e->Iex.Binop.op == Iop_CmpNE8
2321 || e->Iex.Binop.op == Iop_CasCmpEQ8
2322 || e->Iex.Binop.op == Iop_CasCmpNE8)) {
2323 if (isZeroU8(e->Iex.Binop.arg2)) {
2324 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2325 addInstr(env, AMD64Instr_Test64(0xFF,r1));
2326 switch (e->Iex.Binop.op) {
2327 case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2328 case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2329 default: vpanic("iselCondCode(amd64): CmpXX8(expr,0:I8)");
2330 }
2331 } else {
2332 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2333 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2334 HReg r = newVRegI(env);
2335 addInstr(env, mk_iMOVsd_RR(r1,r));
2336 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2337 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r));
2338 switch (e->Iex.Binop.op) {
2339 case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2340 case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2341 default: vpanic("iselCondCode(amd64): CmpXX8(expr,expr)");
2342 }
2343 }
2344 }
2345
2346 /* CmpEQ16 / CmpNE16 */
2347 if (e->tag == Iex_Binop
2348 && (e->Iex.Binop.op == Iop_CmpEQ16
2349 || e->Iex.Binop.op == Iop_CmpNE16
2350 || e->Iex.Binop.op == Iop_CasCmpEQ16
2351 || e->Iex.Binop.op == Iop_CasCmpNE16)) {
2352 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2353 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2354 HReg r = newVRegI(env);
2355 addInstr(env, mk_iMOVsd_RR(r1,r));
2356 addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2357 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r));
2358 switch (e->Iex.Binop.op) {
2359 case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z;
2360 case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ;
2361 default: vpanic("iselCondCode(amd64): CmpXX16");
2362 }
2363 }
2364
2365 /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
2366 Saves a "movq %rax, %tmp" compared to the default route. */
2367 if (e->tag == Iex_Binop
2368 && e->Iex.Binop.op == Iop_CmpNE64
2369 && e->Iex.Binop.arg1->tag == Iex_CCall
2370 && e->Iex.Binop.arg2->tag == Iex_Const) {
2371 IRExpr* cal = e->Iex.Binop.arg1;
2372 IRExpr* con = e->Iex.Binop.arg2;
2373 HReg tmp = newVRegI(env);
2374 /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
2375 vassert(cal->Iex.CCall.retty == Ity_I64); /* else ill-typed IR */
2376 vassert(con->Iex.Const.con->tag == Ico_U64);
2377 /* Marshal args, do the call. */
2378 UInt addToSp = 0;
2379 RetLoc rloc = mk_RetLoc_INVALID();
2380 doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
2381 cal->Iex.CCall.cee,
2382 cal->Iex.CCall.retty, cal->Iex.CCall.args );
2383 vassert(is_sane_RetLoc(rloc));
2384 vassert(rloc.pri == RLPri_Int);
2385 vassert(addToSp == 0);
2386 /* */
2387 addInstr(env, AMD64Instr_Imm64(con->Iex.Const.con->Ico.U64, tmp));
2388 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,
2389 AMD64RMI_Reg(hregAMD64_RAX()), tmp));
2390 return Acc_NZ;
2391 }
2392
2393 /* Cmp*64*(x,y) */
2394 if (e->tag == Iex_Binop
2395 && (e->Iex.Binop.op == Iop_CmpEQ64
2396 || e->Iex.Binop.op == Iop_CmpNE64
2397 || e->Iex.Binop.op == Iop_CmpLT64S
2398 || e->Iex.Binop.op == Iop_CmpLT64U
2399 || e->Iex.Binop.op == Iop_CmpLE64S
2400 || e->Iex.Binop.op == Iop_CmpLE64U
2401 || e->Iex.Binop.op == Iop_CasCmpEQ64
2402 || e->Iex.Binop.op == Iop_CasCmpNE64
2403 || e->Iex.Binop.op == Iop_ExpCmpNE64)) {
2404 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2405 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2406 addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2407 switch (e->Iex.Binop.op) {
2408 case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
2409 case Iop_CmpNE64:
2410 case Iop_CasCmpNE64: case Iop_ExpCmpNE64: return Acc_NZ;
2411 case Iop_CmpLT64S: return Acc_L;
2412 case Iop_CmpLT64U: return Acc_B;
2413 case Iop_CmpLE64S: return Acc_LE;
2414 case Iop_CmpLE64U: return Acc_BE;
2415 default: vpanic("iselCondCode(amd64): CmpXX64");
2416 }
2417 }
2418
2419 /* Cmp*32*(x,y) */
2420 if (e->tag == Iex_Binop
2421 && (e->Iex.Binop.op == Iop_CmpEQ32
2422 || e->Iex.Binop.op == Iop_CmpNE32
2423 || e->Iex.Binop.op == Iop_CmpLT32S
2424 || e->Iex.Binop.op == Iop_CmpLT32U
2425 || e->Iex.Binop.op == Iop_CmpLE32S
2426 || e->Iex.Binop.op == Iop_CmpLE32U
2427 || e->Iex.Binop.op == Iop_CasCmpEQ32
2428 || e->Iex.Binop.op == Iop_CasCmpNE32
2429 || e->Iex.Binop.op == Iop_ExpCmpNE32)) {
2430 HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2431 AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2432 addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2433 switch (e->Iex.Binop.op) {
2434 case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
2435 case Iop_CmpNE32:
2436 case Iop_CasCmpNE32: case Iop_ExpCmpNE32: return Acc_NZ;
2437 case Iop_CmpLT32S: return Acc_L;
2438 case Iop_CmpLT32U: return Acc_B;
2439 case Iop_CmpLE32S: return Acc_LE;
2440 case Iop_CmpLE32U: return Acc_BE;
2441 default: vpanic("iselCondCode(amd64): CmpXX32");
2442 }
2443 }
2444
2445 ppIRExpr(e);
2446 vpanic("iselCondCode(amd64)");
2447 }
2448
2449
2450 /*---------------------------------------------------------*/
2451 /*--- ISEL: Integer expressions (128 bit) ---*/
2452 /*---------------------------------------------------------*/
2453
/* Compute a 128-bit value into a register pair, which is returned via
   the first two parameters.  The returned regs will be virtual (the
   wrapper below asserts this), and they must not be changed by
   subsequent code emitted by the caller.  */
2458
static void iselInt128Expr ( HReg* rHi, HReg* rLo, 
                             ISelEnv* env, IRExpr* e )
{
   /* Let the worker pick the registers ... */
   iselInt128Expr_wrk(rHi, rLo, env, e);
#  if 0
   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
#  endif
   /* ... then insist that both halves are virtual 64-bit integer
      registers. */
   vassert(hregClass(*rHi) == HRcInt64);
   vassert(hregIsVirtual(*rHi));
   vassert(hregClass(*rLo) == HRcInt64);
   vassert(hregIsVirtual(*rLo));
}
2471
2472 /* DO NOT CALL THIS DIRECTLY ! */
iselInt128Expr_wrk(HReg * rHi,HReg * rLo,ISelEnv * env,IRExpr * e)2473 static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
2474 ISelEnv* env, IRExpr* e )
2475 {
2476 vassert(e);
2477 vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
2478
2479 /* read 128-bit IRTemp */
2480 if (e->tag == Iex_RdTmp) {
2481 lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
2482 return;
2483 }
2484
2485 /* --------- BINARY ops --------- */
2486 if (e->tag == Iex_Binop) {
2487 switch (e->Iex.Binop.op) {
2488 /* 64 x 64 -> 128 multiply */
2489 case Iop_MullU64:
2490 case Iop_MullS64: {
2491 /* get one operand into %rax, and the other into a R/M.
2492 Need to make an educated guess about which is better in
2493 which. */
2494 HReg tLo = newVRegI(env);
2495 HReg tHi = newVRegI(env);
2496 Bool syned = toBool(e->Iex.Binop.op == Iop_MullS64);
2497 AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
2498 HReg rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
2499 addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX()));
2500 addInstr(env, AMD64Instr_MulL(syned, rmLeft));
2501 /* Result is now in RDX:RAX. Tell the caller. */
2502 addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2503 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2504 *rHi = tHi;
2505 *rLo = tLo;
2506 return;
2507 }
2508
2509 /* 128 x 64 -> (64(rem),64(div)) division */
2510 case Iop_DivModU128to64:
2511 case Iop_DivModS128to64: {
2512 /* Get the 128-bit operand into rdx:rax, and the other into
2513 any old R/M. */
2514 HReg sHi, sLo;
2515 HReg tLo = newVRegI(env);
2516 HReg tHi = newVRegI(env);
2517 Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS128to64);
2518 AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
2519 iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2520 addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX()));
2521 addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX()));
2522 addInstr(env, AMD64Instr_Div(syned, 8, rmRight));
2523 addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2524 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2525 *rHi = tHi;
2526 *rLo = tLo;
2527 return;
2528 }
2529
2530 /* 64HLto128(e1,e2) */
2531 case Iop_64HLto128:
2532 *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2533 *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2534 return;
2535
2536 default:
2537 break;
2538 }
2539 } /* if (e->tag == Iex_Binop) */
2540
2541 ppIRExpr(e);
2542 vpanic("iselInt128Expr");
2543 }
2544
2545
2546 /*---------------------------------------------------------*/
2547 /*--- ISEL: Floating point expressions (32 bit) ---*/
2548 /*---------------------------------------------------------*/
2549
2550 /* Nothing interesting here; really just wrappers for
2551 64-bit stuff. */
2552
iselFltExpr(ISelEnv * env,IRExpr * e)2553 static HReg iselFltExpr ( ISelEnv* env, IRExpr* e )
2554 {
2555 HReg r = iselFltExpr_wrk( env, e );
2556 # if 0
2557 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2558 # endif
2559 vassert(hregClass(r) == HRcVec128);
2560 vassert(hregIsVirtual(r));
2561 return r;
2562 }
2563
2564 /* DO NOT CALL THIS DIRECTLY */
iselFltExpr_wrk(ISelEnv * env,IRExpr * e)2565 static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e )
2566 {
2567 IRType ty = typeOfIRExpr(env->type_env,e);
2568 vassert(ty == Ity_F32);
2569
2570 if (e->tag == Iex_RdTmp) {
2571 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2572 }
2573
2574 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2575 AMD64AMode* am;
2576 HReg res = newVRegV(env);
2577 vassert(e->Iex.Load.ty == Ity_F32);
2578 am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2579 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am));
2580 return res;
2581 }
2582
2583 if (e->tag == Iex_Binop
2584 && e->Iex.Binop.op == Iop_F64toF32) {
2585 /* Although the result is still held in a standard SSE register,
2586 we need to round it to reflect the loss of accuracy/range
2587 entailed in casting it to a 32-bit float. */
2588 HReg dst = newVRegV(env);
2589 HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
2590 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
2591 addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst));
2592 set_SSE_rounding_default( env );
2593 return dst;
2594 }
2595
2596 if (e->tag == Iex_Get) {
2597 AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2598 hregAMD64_RBP() );
2599 HReg res = newVRegV(env);
2600 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am ));
2601 return res;
2602 }
2603
2604 if (e->tag == Iex_Unop
2605 && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
2606 /* Given an I32, produce an IEEE754 float with the same bit
2607 pattern. */
2608 HReg dst = newVRegV(env);
2609 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2610 AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP());
2611 addInstr(env, AMD64Instr_Store(4, src, m4_rsp));
2612 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp ));
2613 return dst;
2614 }
2615
2616 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
2617 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2618 HReg arg = iselFltExpr(env, e->Iex.Binop.arg2);
2619 HReg dst = newVRegV(env);
2620
2621 /* rf now holds the value to be rounded. The first thing to do
2622 is set the FPU's rounding mode accordingly. */
2623
2624 /* Set host x87 rounding mode */
2625 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2626
2627 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp));
2628 addInstr(env, AMD64Instr_A87Free(1));
2629 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4));
2630 addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2631 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4));
2632 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp));
2633
2634 /* Restore default x87 rounding. */
2635 set_FPU_rounding_default( env );
2636
2637 return dst;
2638 }
2639
2640 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_NegF32) {
2641 /* Sigh ... very rough code. Could do much better. */
2642 /* Get the 128-bit literal 00---0 10---0 into a register
2643 and xor it with the value to be negated. */
2644 HReg r1 = newVRegI(env);
2645 HReg dst = newVRegV(env);
2646 HReg tmp = newVRegV(env);
2647 HReg src = iselFltExpr(env, e->Iex.Unop.arg);
2648 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
2649 addInstr(env, mk_vMOVsd_RR(src,tmp));
2650 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
2651 addInstr(env, AMD64Instr_Imm64( 1ULL<<31, r1 ));
2652 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
2653 addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
2654 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
2655 add_to_rsp(env, 16);
2656 return dst;
2657 }
2658
2659 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF32) {
2660 IRQop *qop = e->Iex.Qop.details;
2661 HReg dst = newVRegV(env);
2662 HReg argX = iselFltExpr(env, qop->arg2);
2663 HReg argY = iselFltExpr(env, qop->arg3);
2664 HReg argZ = iselFltExpr(env, qop->arg4);
2665 /* XXXROUNDINGFIXME */
2666 /* set roundingmode here */
2667 /* subq $16, %rsp -- make a space*/
2668 sub_from_rsp(env, 16);
2669 /* Prepare 4 arg regs:
2670 leaq 0(%rsp), %rdi
2671 leaq 4(%rsp), %rsi
2672 leaq 8(%rsp), %rdx
2673 leaq 12(%rsp), %rcx
2674 */
2675 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2676 hregAMD64_RDI()));
2677 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(4, hregAMD64_RSP()),
2678 hregAMD64_RSI()));
2679 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2680 hregAMD64_RDX()));
2681 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(12, hregAMD64_RSP()),
2682 hregAMD64_RCX()));
2683 /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2684 movss %argX, 0(%rsi)
2685 movss %argY, 0(%rdx)
2686 movss %argZ, 0(%rcx)
2687 */
2688 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argX,
2689 AMD64AMode_IR(0, hregAMD64_RSI())));
2690 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argY,
2691 AMD64AMode_IR(0, hregAMD64_RDX())));
2692 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ,
2693 AMD64AMode_IR(0, hregAMD64_RCX())));
2694 /* call the helper */
2695 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2696 (ULong)(HWord)h_generic_calc_MAddF32,
2697 4, mk_RetLoc_simple(RLPri_None) ));
2698 /* fetch the result from memory, using %r_argp, which the
2699 register allocator will keep alive across the call. */
2700 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst,
2701 AMD64AMode_IR(0, hregAMD64_RSP())));
2702 /* and finally, clear the space */
2703 add_to_rsp(env, 16);
2704 return dst;
2705 }
2706
2707 ppIRExpr(e);
2708 vpanic("iselFltExpr_wrk");
2709 }
2710
2711
2712 /*---------------------------------------------------------*/
2713 /*--- ISEL: Floating point expressions (64 bit) ---*/
2714 /*---------------------------------------------------------*/
2715
2716 /* Compute a 64-bit floating point value into the lower half of an xmm
2717 register, the identity of which is returned. As with
2718 iselIntExpr_R, the returned reg will be virtual, and it must not be
2719 changed by subsequent code emitted by the caller.
2720 */
2721
2722 /* IEEE 754 formats. From http://www.freesoft.org/CIE/RFC/1832/32.htm:
2723
2724 Type S (1 bit) E (11 bits) F (52 bits)
2725 ---- --------- ----------- -----------
2726 signalling NaN u 2047 (max) .0uuuuu---u
2727 (with at least
2728 one 1 bit)
2729 quiet NaN u 2047 (max) .1uuuuu---u
2730
2731 negative infinity 1 2047 (max) .000000---0
2732
2733 positive infinity 0 2047 (max) .000000---0
2734
2735 negative zero 1 0 .000000---0
2736
2737 positive zero 0 0 .000000---0
2738 */
2739
iselDblExpr(ISelEnv * env,IRExpr * e)2740 static HReg iselDblExpr ( ISelEnv* env, IRExpr* e )
2741 {
2742 HReg r = iselDblExpr_wrk( env, e );
2743 # if 0
2744 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2745 # endif
2746 vassert(hregClass(r) == HRcVec128);
2747 vassert(hregIsVirtual(r));
2748 return r;
2749 }
2750
2751 /* DO NOT CALL THIS DIRECTLY */
iselDblExpr_wrk(ISelEnv * env,IRExpr * e)2752 static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
2753 {
2754 IRType ty = typeOfIRExpr(env->type_env,e);
2755 vassert(e);
2756 vassert(ty == Ity_F64);
2757
2758 if (e->tag == Iex_RdTmp) {
2759 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2760 }
2761
2762 if (e->tag == Iex_Const) {
2763 union { ULong u64; Double f64; } u;
2764 HReg res = newVRegV(env);
2765 HReg tmp = newVRegI(env);
2766 vassert(sizeof(u) == 8);
2767 vassert(sizeof(u.u64) == 8);
2768 vassert(sizeof(u.f64) == 8);
2769
2770 if (e->Iex.Const.con->tag == Ico_F64) {
2771 u.f64 = e->Iex.Const.con->Ico.F64;
2772 }
2773 else if (e->Iex.Const.con->tag == Ico_F64i) {
2774 u.u64 = e->Iex.Const.con->Ico.F64i;
2775 }
2776 else
2777 vpanic("iselDblExpr(amd64): const");
2778
2779 addInstr(env, AMD64Instr_Imm64(u.u64, tmp));
2780 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp)));
2781 addInstr(env, AMD64Instr_SseLdSt(
2782 True/*load*/, 8, res,
2783 AMD64AMode_IR(0, hregAMD64_RSP())
2784 ));
2785 add_to_rsp(env, 8);
2786 return res;
2787 }
2788
2789 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2790 AMD64AMode* am;
2791 HReg res = newVRegV(env);
2792 vassert(e->Iex.Load.ty == Ity_F64);
2793 am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2794 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2795 return res;
2796 }
2797
2798 if (e->tag == Iex_Get) {
2799 AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2800 hregAMD64_RBP() );
2801 HReg res = newVRegV(env);
2802 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2803 return res;
2804 }
2805
2806 if (e->tag == Iex_GetI) {
2807 AMD64AMode* am
2808 = genGuestArrayOffset(
2809 env, e->Iex.GetI.descr,
2810 e->Iex.GetI.ix, e->Iex.GetI.bias );
2811 HReg res = newVRegV(env);
2812 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2813 return res;
2814 }
2815
2816 if (e->tag == Iex_Triop) {
2817 IRTriop *triop = e->Iex.Triop.details;
2818 AMD64SseOp op = Asse_INVALID;
2819 switch (triop->op) {
2820 case Iop_AddF64: op = Asse_ADDF; break;
2821 case Iop_SubF64: op = Asse_SUBF; break;
2822 case Iop_MulF64: op = Asse_MULF; break;
2823 case Iop_DivF64: op = Asse_DIVF; break;
2824 default: break;
2825 }
2826 if (op != Asse_INVALID) {
2827 HReg dst = newVRegV(env);
2828 HReg argL = iselDblExpr(env, triop->arg2);
2829 HReg argR = iselDblExpr(env, triop->arg3);
2830 addInstr(env, mk_vMOVsd_RR(argL, dst));
2831 /* XXXROUNDINGFIXME */
2832 /* set roundingmode here */
2833 addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
2834 return dst;
2835 }
2836 }
2837
2838 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF64) {
2839 IRQop *qop = e->Iex.Qop.details;
2840 HReg dst = newVRegV(env);
2841 HReg argX = iselDblExpr(env, qop->arg2);
2842 HReg argY = iselDblExpr(env, qop->arg3);
2843 HReg argZ = iselDblExpr(env, qop->arg4);
2844 /* XXXROUNDINGFIXME */
2845 /* set roundingmode here */
2846 /* subq $32, %rsp -- make a space*/
2847 sub_from_rsp(env, 32);
2848 /* Prepare 4 arg regs:
2849 leaq 0(%rsp), %rdi
2850 leaq 8(%rsp), %rsi
2851 leaq 16(%rsp), %rdx
2852 leaq 24(%rsp), %rcx
2853 */
2854 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2855 hregAMD64_RDI()));
2856 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2857 hregAMD64_RSI()));
2858 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, hregAMD64_RSP()),
2859 hregAMD64_RDX()));
2860 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(24, hregAMD64_RSP()),
2861 hregAMD64_RCX()));
2862 /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2863 movsd %argX, 0(%rsi)
2864 movsd %argY, 0(%rdx)
2865 movsd %argZ, 0(%rcx)
2866 */
2867 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argX,
2868 AMD64AMode_IR(0, hregAMD64_RSI())));
2869 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argY,
2870 AMD64AMode_IR(0, hregAMD64_RDX())));
2871 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ,
2872 AMD64AMode_IR(0, hregAMD64_RCX())));
2873 /* call the helper */
2874 addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2875 (ULong)(HWord)h_generic_calc_MAddF64,
2876 4, mk_RetLoc_simple(RLPri_None) ));
2877 /* fetch the result from memory, using %r_argp, which the
2878 register allocator will keep alive across the call. */
2879 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst,
2880 AMD64AMode_IR(0, hregAMD64_RSP())));
2881 /* and finally, clear the space */
2882 add_to_rsp(env, 32);
2883 return dst;
2884 }
2885
2886 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
2887 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2888 HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
2889 HReg dst = newVRegV(env);
2890
2891 /* rf now holds the value to be rounded. The first thing to do
2892 is set the FPU's rounding mode accordingly. */
2893
2894 /* Set host x87 rounding mode */
2895 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2896
2897 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
2898 addInstr(env, AMD64Instr_A87Free(1));
2899 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
2900 addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2901 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
2902 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
2903
2904 /* Restore default x87 rounding. */
2905 set_FPU_rounding_default( env );
2906
2907 return dst;
2908 }
2909
2910 IRTriop *triop = e->Iex.Triop.details;
2911 if (e->tag == Iex_Triop
2912 && (triop->op == Iop_ScaleF64
2913 || triop->op == Iop_AtanF64
2914 || triop->op == Iop_Yl2xF64
2915 || triop->op == Iop_Yl2xp1F64
2916 || triop->op == Iop_PRemF64
2917 || triop->op == Iop_PRem1F64)
2918 ) {
2919 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2920 HReg arg1 = iselDblExpr(env, triop->arg2);
2921 HReg arg2 = iselDblExpr(env, triop->arg3);
2922 HReg dst = newVRegV(env);
2923 Bool arg2first = toBool(triop->op == Iop_ScaleF64
2924 || triop->op == Iop_PRemF64
2925 || triop->op == Iop_PRem1F64);
2926 addInstr(env, AMD64Instr_A87Free(2));
2927
2928 /* one arg -> top of x87 stack */
2929 addInstr(env, AMD64Instr_SseLdSt(
2930 False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp));
2931 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
2932
2933 /* other arg -> top of x87 stack */
2934 addInstr(env, AMD64Instr_SseLdSt(
2935 False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
2936 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
2937
2938 /* do it */
2939 /* XXXROUNDINGFIXME */
2940 /* set roundingmode here */
2941 switch (triop->op) {
2942 case Iop_ScaleF64:
2943 addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
2944 break;
2945 case Iop_AtanF64:
2946 addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN));
2947 break;
2948 case Iop_Yl2xF64:
2949 addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X));
2950 break;
2951 case Iop_Yl2xp1F64:
2952 addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1));
2953 break;
2954 case Iop_PRemF64:
2955 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
2956 break;
2957 case Iop_PRem1F64:
2958 addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
2959 break;
2960 default:
2961 vassert(0);
2962 }
2963
2964 /* save result */
2965 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
2966 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
2967 return dst;
2968 }
2969
2970 if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
2971 HReg dst = newVRegV(env);
2972 HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2);
2973 set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
2974 addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst ));
2975 set_SSE_rounding_default( env );
2976 return dst;
2977 }
2978
2979 if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) {
2980 HReg dst = newVRegV(env);
2981 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
2982 set_SSE_rounding_default( env );
2983 addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst ));
2984 return dst;
2985 }
2986
2987 if (e->tag == Iex_Unop
2988 && (e->Iex.Unop.op == Iop_NegF64
2989 || e->Iex.Unop.op == Iop_AbsF64)) {
2990 /* Sigh ... very rough code. Could do much better. */
2991 /* Get the 128-bit literal 00---0 10---0 into a register
2992 and xor/nand it with the value to be negated. */
2993 HReg r1 = newVRegI(env);
2994 HReg dst = newVRegV(env);
2995 HReg tmp = newVRegV(env);
2996 HReg src = iselDblExpr(env, e->Iex.Unop.arg);
2997 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
2998 addInstr(env, mk_vMOVsd_RR(src,tmp));
2999 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3000 addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 ));
3001 addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
3002 addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
3003
3004 if (e->Iex.Unop.op == Iop_NegF64)
3005 addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
3006 else
3007 addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst));
3008
3009 add_to_rsp(env, 16);
3010 return dst;
3011 }
3012
3013 if (e->tag == Iex_Binop) {
3014 A87FpOp fpop = Afp_INVALID;
3015 switch (e->Iex.Binop.op) {
3016 case Iop_SqrtF64: fpop = Afp_SQRT; break;
3017 case Iop_SinF64: fpop = Afp_SIN; break;
3018 case Iop_CosF64: fpop = Afp_COS; break;
3019 case Iop_TanF64: fpop = Afp_TAN; break;
3020 case Iop_2xm1F64: fpop = Afp_2XM1; break;
3021 default: break;
3022 }
3023 if (fpop != Afp_INVALID) {
3024 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3025 HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
3026 HReg dst = newVRegV(env);
3027 Int nNeeded = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1;
3028 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
3029 addInstr(env, AMD64Instr_A87Free(nNeeded));
3030 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3031 /* XXXROUNDINGFIXME */
3032 /* set roundingmode here */
3033 /* Note that AMD64Instr_A87FpOp(Afp_TAN) sets the condition
3034 codes. I don't think that matters, since this insn
3035 selector never generates such an instruction intervening
3036 between an flag-setting instruction and a flag-using
3037 instruction. */
3038 addInstr(env, AMD64Instr_A87FpOp(fpop));
3039 addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3040 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3041 return dst;
3042 }
3043 }
3044
3045 if (e->tag == Iex_Unop) {
3046 switch (e->Iex.Unop.op) {
3047 //.. case Iop_I32toF64: {
3048 //.. HReg dst = newVRegF(env);
3049 //.. HReg ri = iselIntExpr_R(env, e->Iex.Unop.arg);
3050 //.. addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
3051 //.. set_FPU_rounding_default(env);
3052 //.. addInstr(env, X86Instr_FpLdStI(
3053 //.. True/*load*/, 4, dst,
3054 //.. X86AMode_IR(0, hregX86_ESP())));
3055 //.. add_to_esp(env, 4);
3056 //.. return dst;
3057 //.. }
3058 case Iop_ReinterpI64asF64: {
3059 /* Given an I64, produce an IEEE754 double with the same
3060 bit pattern. */
3061 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3062 HReg dst = newVRegV(env);
3063 AMD64RI* src = iselIntExpr_RI(env, e->Iex.Unop.arg);
3064 /* paranoia */
3065 set_SSE_rounding_default(env);
3066 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp));
3067 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3068 return dst;
3069 }
3070 case Iop_F32toF64: {
3071 HReg f32;
3072 HReg f64 = newVRegV(env);
3073 /* this shouldn't be necessary, but be paranoid ... */
3074 set_SSE_rounding_default(env);
3075 f32 = iselFltExpr(env, e->Iex.Unop.arg);
3076 addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64));
3077 return f64;
3078 }
3079 default:
3080 break;
3081 }
3082 }
3083
3084 /* --------- MULTIPLEX --------- */
3085 if (e->tag == Iex_ITE) { // VFD
3086 HReg r1, r0, dst;
3087 vassert(ty == Ity_F64);
3088 vassert(typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1);
3089 r1 = iselDblExpr(env, e->Iex.ITE.iftrue);
3090 r0 = iselDblExpr(env, e->Iex.ITE.iffalse);
3091 dst = newVRegV(env);
3092 addInstr(env, mk_vMOVsd_RR(r1,dst));
3093 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3094 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3095 return dst;
3096 }
3097
3098 ppIRExpr(e);
3099 vpanic("iselDblExpr_wrk");
3100 }
3101
3102
3103 /*---------------------------------------------------------*/
3104 /*--- ISEL: SIMD (Vector) expressions, 128 bit. ---*/
3105 /*---------------------------------------------------------*/
3106
iselVecExpr(ISelEnv * env,IRExpr * e)3107 static HReg iselVecExpr ( ISelEnv* env, IRExpr* e )
3108 {
3109 HReg r = iselVecExpr_wrk( env, e );
3110 # if 0
3111 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3112 # endif
3113 vassert(hregClass(r) == HRcVec128);
3114 vassert(hregIsVirtual(r));
3115 return r;
3116 }
3117
3118
3119 /* DO NOT CALL THIS DIRECTLY */
iselVecExpr_wrk(ISelEnv * env,IRExpr * e)3120 static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
3121 {
3122 HWord fn = 0; /* address of helper fn, if required */
3123 Bool arg1isEReg = False;
3124 AMD64SseOp op = Asse_INVALID;
3125 IRType ty = typeOfIRExpr(env->type_env,e);
3126 vassert(e);
3127 vassert(ty == Ity_V128);
3128
3129 if (e->tag == Iex_RdTmp) {
3130 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3131 }
3132
3133 if (e->tag == Iex_Get) {
3134 HReg dst = newVRegV(env);
3135 addInstr(env, AMD64Instr_SseLdSt(
3136 True/*load*/,
3137 16,
3138 dst,
3139 AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())
3140 )
3141 );
3142 return dst;
3143 }
3144
3145 if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3146 HReg dst = newVRegV(env);
3147 AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
3148 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
3149 return dst;
3150 }
3151
3152 if (e->tag == Iex_Const) {
3153 HReg dst = newVRegV(env);
3154 vassert(e->Iex.Const.con->tag == Ico_V128);
3155 switch (e->Iex.Const.con->Ico.V128) {
3156 case 0x0000:
3157 dst = generate_zeroes_V128(env);
3158 break;
3159 case 0xFFFF:
3160 dst = generate_ones_V128(env);
3161 break;
3162 default: {
3163 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3164 /* do push_uimm64 twice, first time for the high-order half. */
3165 push_uimm64(env, bitmask8_to_bytemask64(
3166 (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF
3167 ));
3168 push_uimm64(env, bitmask8_to_bytemask64(
3169 (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF
3170 ));
3171 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 ));
3172 add_to_rsp(env, 16);
3173 break;
3174 }
3175 }
3176 return dst;
3177 }
3178
3179 if (e->tag == Iex_Unop) {
3180 switch (e->Iex.Unop.op) {
3181
3182 case Iop_NotV128: {
3183 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3184 return do_sse_NotV128(env, arg);
3185 }
3186
3187 case Iop_CmpNEZ64x2: {
3188 /* We can use SSE2 instructions for this. */
3189 /* Ideally, we want to do a 64Ix2 comparison against zero of
3190 the operand. Problem is no such insn exists. Solution
3191 therefore is to do a 32Ix4 comparison instead, and bitwise-
3192 negate (NOT) the result. Let a,b,c,d be 32-bit lanes, and
3193 let the not'd result of this initial comparison be a:b:c:d.
3194 What we need to compute is (a|b):(a|b):(c|d):(c|d). So, use
3195 pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
3196 giving the required result.
3197
3198 The required selection sequence is 2,3,0,1, which
3199 according to Intel's documentation means the pshufd
3200 literal value is 0xB1, that is,
3201 (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
3202 */
3203 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3204 HReg tmp = generate_zeroes_V128(env);
3205 HReg dst = newVRegV(env);
3206 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
3207 tmp = do_sse_NotV128(env, tmp);
3208 addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
3209 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3210 return dst;
3211 }
3212
3213 case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
3214 case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
3215 case Iop_CmpNEZ8x16: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
3216 do_CmpNEZ_vector:
3217 {
3218 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3219 HReg tmp = newVRegV(env);
3220 HReg zero = generate_zeroes_V128(env);
3221 HReg dst;
3222 addInstr(env, mk_vMOVsd_RR(arg, tmp));
3223 addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
3224 dst = do_sse_NotV128(env, tmp);
3225 return dst;
3226 }
3227
3228 case Iop_RecipEst32Fx4: op = Asse_RCPF; goto do_32Fx4_unary;
3229 case Iop_RSqrtEst32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
3230 do_32Fx4_unary:
3231 {
3232 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3233 HReg dst = newVRegV(env);
3234 addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst));
3235 return dst;
3236 }
3237
3238 case Iop_RecipEst32F0x4: op = Asse_RCPF; goto do_32F0x4_unary;
3239 case Iop_RSqrtEst32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary;
3240 case Iop_Sqrt32F0x4: op = Asse_SQRTF; goto do_32F0x4_unary;
3241 do_32F0x4_unary:
3242 {
3243 /* A bit subtle. We have to copy the arg to the result
3244 register first, because actually doing the SSE scalar insn
3245 leaves the upper 3/4 of the destination register
3246 unchanged. Whereas the required semantics of these
3247 primops is that the upper 3/4 is simply copied in from the
3248 argument. */
3249 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3250 HReg dst = newVRegV(env);
3251 addInstr(env, mk_vMOVsd_RR(arg, dst));
3252 addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst));
3253 return dst;
3254 }
3255
3256 case Iop_Sqrt64F0x2: op = Asse_SQRTF; goto do_64F0x2_unary;
3257 do_64F0x2_unary:
3258 {
3259 /* A bit subtle. We have to copy the arg to the result
3260 register first, because actually doing the SSE scalar insn
3261 leaves the upper half of the destination register
3262 unchanged. Whereas the required semantics of these
3263 primops is that the upper half is simply copied in from the
3264 argument. */
3265 HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3266 HReg dst = newVRegV(env);
3267 addInstr(env, mk_vMOVsd_RR(arg, dst));
3268 addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst));
3269 return dst;
3270 }
3271
3272 case Iop_32UtoV128: {
3273 HReg dst = newVRegV(env);
3274 AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP());
3275 AMD64RI* ri = iselIntExpr_RI(env, e->Iex.Unop.arg);
3276 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32));
3277 addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32));
3278 return dst;
3279 }
3280
3281 case Iop_64UtoV128: {
3282 HReg dst = newVRegV(env);
3283 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3284 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
3285 addInstr(env, AMD64Instr_Push(rmi));
3286 addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0));
3287 add_to_rsp(env, 8);
3288 return dst;
3289 }
3290
3291 case Iop_V256toV128_0:
3292 case Iop_V256toV128_1: {
3293 HReg vHi, vLo;
3294 iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
3295 return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo;
3296 }
3297
3298 default:
3299 break;
3300 } /* switch (e->Iex.Unop.op) */
3301 } /* if (e->tag == Iex_Unop) */
3302
3303 if (e->tag == Iex_Binop) {
3304 switch (e->Iex.Binop.op) {
3305
3306 case Iop_Sqrt64Fx2:
3307 case Iop_Sqrt32Fx4: {
3308 /* :: (rmode, vec) -> vec */
3309 HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3310 HReg dst = newVRegV(env);
3311 /* XXXROUNDINGFIXME */
3312 /* set roundingmode here */
3313 addInstr(env, (e->Iex.Binop.op == Iop_Sqrt64Fx2
3314 ? AMD64Instr_Sse64Fx2 : AMD64Instr_Sse32Fx4)
3315 (Asse_SQRTF, arg, dst));
3316 return dst;
3317 }
3318
3319 /* FIXME: could we generate MOVQ here? */
3320 case Iop_SetV128lo64: {
3321 HReg dst = newVRegV(env);
3322 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3323 HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3324 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3325 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3326 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16));
3327 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3328 return dst;
3329 }
3330
3331 /* FIXME: could we generate MOVD here? */
3332 case Iop_SetV128lo32: {
3333 HReg dst = newVRegV(env);
3334 HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3335 HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3336 AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3337 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3338 addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16));
3339 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3340 return dst;
3341 }
3342
3343 case Iop_64HLtoV128: {
3344 HReg rsp = hregAMD64_RSP();
3345 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, rsp);
3346 AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
3347 AMD64RI* qHi = iselIntExpr_RI(env, e->Iex.Binop.arg1);
3348 AMD64RI* qLo = iselIntExpr_RI(env, e->Iex.Binop.arg2);
3349 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qHi, m8_rsp));
3350 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qLo, m16_rsp));
3351 HReg dst = newVRegV(env);
3352 /* One store-forwarding stall coming up, oh well :-( */
3353 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, m16_rsp));
3354 return dst;
3355 }
3356
3357 case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4;
3358 case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
3359 case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
3360 case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
3361 case Iop_Max32Fx4: op = Asse_MAXF; goto do_32Fx4;
3362 case Iop_Min32Fx4: op = Asse_MINF; goto do_32Fx4;
3363 do_32Fx4:
3364 {
3365 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3366 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3367 HReg dst = newVRegV(env);
3368 addInstr(env, mk_vMOVsd_RR(argL, dst));
3369 addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3370 return dst;
3371 }
3372
3373 case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2;
3374 case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
3375 case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
3376 case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
3377 case Iop_Max64Fx2: op = Asse_MAXF; goto do_64Fx2;
3378 case Iop_Min64Fx2: op = Asse_MINF; goto do_64Fx2;
3379 do_64Fx2:
3380 {
3381 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3382 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3383 HReg dst = newVRegV(env);
3384 addInstr(env, mk_vMOVsd_RR(argL, dst));
3385 addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3386 return dst;
3387 }
3388
3389 case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
3390 case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
3391 case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
3392 case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
3393 case Iop_Add32F0x4: op = Asse_ADDF; goto do_32F0x4;
3394 case Iop_Div32F0x4: op = Asse_DIVF; goto do_32F0x4;
3395 case Iop_Max32F0x4: op = Asse_MAXF; goto do_32F0x4;
3396 case Iop_Min32F0x4: op = Asse_MINF; goto do_32F0x4;
3397 case Iop_Mul32F0x4: op = Asse_MULF; goto do_32F0x4;
3398 case Iop_Sub32F0x4: op = Asse_SUBF; goto do_32F0x4;
3399 do_32F0x4: {
3400 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3401 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3402 HReg dst = newVRegV(env);
3403 addInstr(env, mk_vMOVsd_RR(argL, dst));
3404 addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
3405 return dst;
3406 }
3407
3408 case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
3409 case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
3410 case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
3411 case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
3412 case Iop_Add64F0x2: op = Asse_ADDF; goto do_64F0x2;
3413 case Iop_Div64F0x2: op = Asse_DIVF; goto do_64F0x2;
3414 case Iop_Max64F0x2: op = Asse_MAXF; goto do_64F0x2;
3415 case Iop_Min64F0x2: op = Asse_MINF; goto do_64F0x2;
3416 case Iop_Mul64F0x2: op = Asse_MULF; goto do_64F0x2;
3417 case Iop_Sub64F0x2: op = Asse_SUBF; goto do_64F0x2;
3418 do_64F0x2: {
3419 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3420 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3421 HReg dst = newVRegV(env);
3422 addInstr(env, mk_vMOVsd_RR(argL, dst));
3423 addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
3424 return dst;
3425 }
3426
3427 case Iop_QNarrowBin32Sto16Sx8:
3428 op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
3429 case Iop_QNarrowBin16Sto8Sx16:
3430 op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
3431 case Iop_QNarrowBin16Sto8Ux16:
3432 op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
3433
3434 case Iop_InterleaveHI8x16:
3435 op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
3436 case Iop_InterleaveHI16x8:
3437 op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
3438 case Iop_InterleaveHI32x4:
3439 op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
3440 case Iop_InterleaveHI64x2:
3441 op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
3442
3443 case Iop_InterleaveLO8x16:
3444 op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
3445 case Iop_InterleaveLO16x8:
3446 op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
3447 case Iop_InterleaveLO32x4:
3448 op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
3449 case Iop_InterleaveLO64x2:
3450 op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
3451
3452 case Iop_AndV128: op = Asse_AND; goto do_SseReRg;
3453 case Iop_OrV128: op = Asse_OR; goto do_SseReRg;
3454 case Iop_XorV128: op = Asse_XOR; goto do_SseReRg;
3455 case Iop_Add8x16: op = Asse_ADD8; goto do_SseReRg;
3456 case Iop_Add16x8: op = Asse_ADD16; goto do_SseReRg;
3457 case Iop_Add32x4: op = Asse_ADD32; goto do_SseReRg;
3458 case Iop_Add64x2: op = Asse_ADD64; goto do_SseReRg;
3459 case Iop_QAdd8Sx16: op = Asse_QADD8S; goto do_SseReRg;
3460 case Iop_QAdd16Sx8: op = Asse_QADD16S; goto do_SseReRg;
3461 case Iop_QAdd8Ux16: op = Asse_QADD8U; goto do_SseReRg;
3462 case Iop_QAdd16Ux8: op = Asse_QADD16U; goto do_SseReRg;
3463 case Iop_Avg8Ux16: op = Asse_AVG8U; goto do_SseReRg;
3464 case Iop_Avg16Ux8: op = Asse_AVG16U; goto do_SseReRg;
3465 case Iop_CmpEQ8x16: op = Asse_CMPEQ8; goto do_SseReRg;
3466 case Iop_CmpEQ16x8: op = Asse_CMPEQ16; goto do_SseReRg;
3467 case Iop_CmpEQ32x4: op = Asse_CMPEQ32; goto do_SseReRg;
3468 case Iop_CmpGT8Sx16: op = Asse_CMPGT8S; goto do_SseReRg;
3469 case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
3470 case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
3471 case Iop_Max16Sx8: op = Asse_MAX16S; goto do_SseReRg;
3472 case Iop_Max8Ux16: op = Asse_MAX8U; goto do_SseReRg;
3473 case Iop_Min16Sx8: op = Asse_MIN16S; goto do_SseReRg;
3474 case Iop_Min8Ux16: op = Asse_MIN8U; goto do_SseReRg;
3475 case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
3476 case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
3477 case Iop_Mul16x8: op = Asse_MUL16; goto do_SseReRg;
3478 case Iop_Sub8x16: op = Asse_SUB8; goto do_SseReRg;
3479 case Iop_Sub16x8: op = Asse_SUB16; goto do_SseReRg;
3480 case Iop_Sub32x4: op = Asse_SUB32; goto do_SseReRg;
3481 case Iop_Sub64x2: op = Asse_SUB64; goto do_SseReRg;
3482 case Iop_QSub8Sx16: op = Asse_QSUB8S; goto do_SseReRg;
3483 case Iop_QSub16Sx8: op = Asse_QSUB16S; goto do_SseReRg;
3484 case Iop_QSub8Ux16: op = Asse_QSUB8U; goto do_SseReRg;
3485 case Iop_QSub16Ux8: op = Asse_QSUB16U; goto do_SseReRg;
3486 do_SseReRg: {
3487 HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
3488 HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
3489 HReg dst = newVRegV(env);
3490 if (arg1isEReg) {
3491 addInstr(env, mk_vMOVsd_RR(arg2, dst));
3492 addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
3493 } else {
3494 addInstr(env, mk_vMOVsd_RR(arg1, dst));
3495 addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
3496 }
3497 return dst;
3498 }
3499
3500 case Iop_ShlN16x8: op = Asse_SHL16; goto do_SseShift;
3501 case Iop_ShlN32x4: op = Asse_SHL32; goto do_SseShift;
3502 case Iop_ShlN64x2: op = Asse_SHL64; goto do_SseShift;
3503 case Iop_SarN16x8: op = Asse_SAR16; goto do_SseShift;
3504 case Iop_SarN32x4: op = Asse_SAR32; goto do_SseShift;
3505 case Iop_ShrN16x8: op = Asse_SHR16; goto do_SseShift;
3506 case Iop_ShrN32x4: op = Asse_SHR32; goto do_SseShift;
3507 case Iop_ShrN64x2: op = Asse_SHR64; goto do_SseShift;
3508 do_SseShift: {
3509 HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
3510 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3511 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3512 HReg ereg = newVRegV(env);
3513 HReg dst = newVRegV(env);
3514 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3515 addInstr(env, AMD64Instr_Push(rmi));
3516 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
3517 addInstr(env, mk_vMOVsd_RR(greg, dst));
3518 addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
3519 add_to_rsp(env, 16);
3520 return dst;
3521 }
3522
3523 case Iop_Mul32x4: fn = (HWord)h_generic_calc_Mul32x4;
3524 goto do_SseAssistedBinary;
3525 case Iop_Max32Sx4: fn = (HWord)h_generic_calc_Max32Sx4;
3526 goto do_SseAssistedBinary;
3527 case Iop_Min32Sx4: fn = (HWord)h_generic_calc_Min32Sx4;
3528 goto do_SseAssistedBinary;
3529 case Iop_Max32Ux4: fn = (HWord)h_generic_calc_Max32Ux4;
3530 goto do_SseAssistedBinary;
3531 case Iop_Min32Ux4: fn = (HWord)h_generic_calc_Min32Ux4;
3532 goto do_SseAssistedBinary;
3533 case Iop_Max16Ux8: fn = (HWord)h_generic_calc_Max16Ux8;
3534 goto do_SseAssistedBinary;
3535 case Iop_Min16Ux8: fn = (HWord)h_generic_calc_Min16Ux8;
3536 goto do_SseAssistedBinary;
3537 case Iop_Max8Sx16: fn = (HWord)h_generic_calc_Max8Sx16;
3538 goto do_SseAssistedBinary;
3539 case Iop_Min8Sx16: fn = (HWord)h_generic_calc_Min8Sx16;
3540 goto do_SseAssistedBinary;
3541 case Iop_CmpEQ64x2: fn = (HWord)h_generic_calc_CmpEQ64x2;
3542 goto do_SseAssistedBinary;
3543 case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
3544 goto do_SseAssistedBinary;
3545 case Iop_Perm32x4: fn = (HWord)h_generic_calc_Perm32x4;
3546 goto do_SseAssistedBinary;
3547 case Iop_QNarrowBin32Sto16Ux8:
3548 fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
3549 goto do_SseAssistedBinary;
3550 case Iop_NarrowBin16to8x16:
3551 fn = (HWord)h_generic_calc_NarrowBin16to8x16;
3552 goto do_SseAssistedBinary;
3553 case Iop_NarrowBin32to16x8:
3554 fn = (HWord)h_generic_calc_NarrowBin32to16x8;
3555 goto do_SseAssistedBinary;
3556 do_SseAssistedBinary: {
3557 /* RRRufff! RRRufff code is what we're generating here. Oh
3558 well. */
3559 vassert(fn != 0);
3560 HReg dst = newVRegV(env);
3561 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3562 HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3563 HReg argp = newVRegI(env);
3564 /* subq $112, %rsp -- make a space*/
3565 sub_from_rsp(env, 112);
3566 /* leaq 48(%rsp), %r_argp -- point into it */
3567 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3568 argp));
3569 /* andq $-16, %r_argp -- 16-align the pointer */
3570 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3571 AMD64RMI_Imm( ~(UInt)15 ),
3572 argp));
3573 /* Prepare 3 arg regs:
3574 leaq 0(%r_argp), %rdi
3575 leaq 16(%r_argp), %rsi
3576 leaq 32(%r_argp), %rdx
3577 */
3578 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3579 hregAMD64_RDI()));
3580 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3581 hregAMD64_RSI()));
3582 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
3583 hregAMD64_RDX()));
3584 /* Store the two args, at (%rsi) and (%rdx):
3585 movupd %argL, 0(%rsi)
3586 movupd %argR, 0(%rdx)
3587 */
3588 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3589 AMD64AMode_IR(0, hregAMD64_RSI())));
3590 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
3591 AMD64AMode_IR(0, hregAMD64_RDX())));
3592 /* call the helper */
3593 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3594 3, mk_RetLoc_simple(RLPri_None) ));
3595 /* fetch the result from memory, using %r_argp, which the
3596 register allocator will keep alive across the call. */
3597 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3598 AMD64AMode_IR(0, argp)));
3599 /* and finally, clear the space */
3600 add_to_rsp(env, 112);
3601 return dst;
3602 }
3603
3604 case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
3605 goto do_SseAssistedVectorAndScalar;
3606 case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
3607 goto do_SseAssistedVectorAndScalar;
3608 do_SseAssistedVectorAndScalar: {
3609 /* RRRufff! RRRufff code is what we're generating here. Oh
3610 well. */
3611 vassert(fn != 0);
3612 HReg dst = newVRegV(env);
3613 HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3614 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
3615 HReg argp = newVRegI(env);
3616 /* subq $112, %rsp -- make a space*/
3617 sub_from_rsp(env, 112);
3618 /* leaq 48(%rsp), %r_argp -- point into it */
3619 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3620 argp));
3621 /* andq $-16, %r_argp -- 16-align the pointer */
3622 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3623 AMD64RMI_Imm( ~(UInt)15 ),
3624 argp));
3625 /* Prepare 2 vector arg regs:
3626 leaq 0(%r_argp), %rdi
3627 leaq 16(%r_argp), %rsi
3628 */
3629 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3630 hregAMD64_RDI()));
3631 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3632 hregAMD64_RSI()));
3633 /* Store the vector arg, at (%rsi):
3634 movupd %argL, 0(%rsi)
3635 */
3636 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3637 AMD64AMode_IR(0, hregAMD64_RSI())));
3638 /* And get the scalar value into rdx */
3639 addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
3640
3641 /* call the helper */
3642 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3643 3, mk_RetLoc_simple(RLPri_None) ));
3644 /* fetch the result from memory, using %r_argp, which the
3645 register allocator will keep alive across the call. */
3646 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3647 AMD64AMode_IR(0, argp)));
3648 /* and finally, clear the space */
3649 add_to_rsp(env, 112);
3650 return dst;
3651 }
3652
3653 default:
3654 break;
3655 } /* switch (e->Iex.Binop.op) */
3656 } /* if (e->tag == Iex_Binop) */
3657
3658 if (e->tag == Iex_Triop) {
3659 IRTriop *triop = e->Iex.Triop.details;
3660 switch (triop->op) {
3661
3662 case Iop_Add64Fx2: op = Asse_ADDF; goto do_64Fx2_w_rm;
3663 case Iop_Sub64Fx2: op = Asse_SUBF; goto do_64Fx2_w_rm;
3664 case Iop_Mul64Fx2: op = Asse_MULF; goto do_64Fx2_w_rm;
3665 case Iop_Div64Fx2: op = Asse_DIVF; goto do_64Fx2_w_rm;
3666 do_64Fx2_w_rm:
3667 {
3668 HReg argL = iselVecExpr(env, triop->arg2);
3669 HReg argR = iselVecExpr(env, triop->arg3);
3670 HReg dst = newVRegV(env);
3671 addInstr(env, mk_vMOVsd_RR(argL, dst));
3672 /* XXXROUNDINGFIXME */
3673 /* set roundingmode here */
3674 addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3675 return dst;
3676 }
3677
3678 case Iop_Add32Fx4: op = Asse_ADDF; goto do_32Fx4_w_rm;
3679 case Iop_Sub32Fx4: op = Asse_SUBF; goto do_32Fx4_w_rm;
3680 case Iop_Mul32Fx4: op = Asse_MULF; goto do_32Fx4_w_rm;
3681 case Iop_Div32Fx4: op = Asse_DIVF; goto do_32Fx4_w_rm;
3682 do_32Fx4_w_rm:
3683 {
3684 HReg argL = iselVecExpr(env, triop->arg2);
3685 HReg argR = iselVecExpr(env, triop->arg3);
3686 HReg dst = newVRegV(env);
3687 addInstr(env, mk_vMOVsd_RR(argL, dst));
3688 /* XXXROUNDINGFIXME */
3689 /* set roundingmode here */
3690 addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3691 return dst;
3692 }
3693
3694 default:
3695 break;
3696 } /* switch (triop->op) */
3697 } /* if (e->tag == Iex_Triop) */
3698
3699 if (e->tag == Iex_ITE) { // VFD
3700 HReg r1 = iselVecExpr(env, e->Iex.ITE.iftrue);
3701 HReg r0 = iselVecExpr(env, e->Iex.ITE.iffalse);
3702 HReg dst = newVRegV(env);
3703 addInstr(env, mk_vMOVsd_RR(r1,dst));
3704 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3705 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3706 return dst;
3707 }
3708
3709 //vec_fail:
3710 vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
3711 LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
3712 ppIRExpr(e);
3713 vpanic("iselVecExpr_wrk");
3714 }
3715
3716
3717 /*---------------------------------------------------------*/
3718 /*--- ISEL: SIMD (V256) expressions, into 2 XMM regs. --*/
3719 /*---------------------------------------------------------*/
3720
/* Compute a V256-typed expression into a pair of registers.  This is
   the entry point the rest of the selector should use: it hands the
   work to iselDVecExpr_wrk and afterwards sanity-checks that both
   registers written to *rHi and *rLo are virtual registers of class
   HRcVec128. */
static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
                           ISelEnv* env, IRExpr* e )
{
   iselDVecExpr_wrk( rHi, rLo, env, e );

   /* Register class of each half ... */
   vassert(hregClass(*rHi) == HRcVec128);
   vassert(hregClass(*rLo) == HRcVec128);

   /* ... and neither half may be a real (non-virtual) register. */
   vassert(hregIsVirtual(*rHi));
   vassert(hregIsVirtual(*rLo));
}
3733
3734
3735 /* DO NOT CALL THIS DIRECTLY */
iselDVecExpr_wrk(HReg * rHi,HReg * rLo,ISelEnv * env,IRExpr * e)3736 static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3737 ISelEnv* env, IRExpr* e )
3738 {
3739 HWord fn = 0; /* address of helper fn, if required */
3740 vassert(e);
3741 IRType ty = typeOfIRExpr(env->type_env,e);
3742 vassert(ty == Ity_V256);
3743
3744 AMD64SseOp op = Asse_INVALID;
3745
3746 /* read 256-bit IRTemp */
3747 if (e->tag == Iex_RdTmp) {
3748 lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
3749 return;
3750 }
3751
3752 if (e->tag == Iex_Get) {
3753 HReg vHi = newVRegV(env);
3754 HReg vLo = newVRegV(env);
3755 HReg rbp = hregAMD64_RBP();
3756 AMD64AMode* am0 = AMD64AMode_IR(e->Iex.Get.offset + 0, rbp);
3757 AMD64AMode* am16 = AMD64AMode_IR(e->Iex.Get.offset + 16, rbp);
3758 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3759 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3760 *rHi = vHi;
3761 *rLo = vLo;
3762 return;
3763 }
3764
3765 if (e->tag == Iex_Load) {
3766 HReg vHi = newVRegV(env);
3767 HReg vLo = newVRegV(env);
3768 HReg rA = iselIntExpr_R(env, e->Iex.Load.addr);
3769 AMD64AMode* am0 = AMD64AMode_IR(0, rA);
3770 AMD64AMode* am16 = AMD64AMode_IR(16, rA);
3771 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3772 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3773 *rHi = vHi;
3774 *rLo = vLo;
3775 return;
3776 }
3777
3778 if (e->tag == Iex_Const) {
3779 vassert(e->Iex.Const.con->tag == Ico_V256);
3780 switch (e->Iex.Const.con->Ico.V256) {
3781 case 0x00000000: {
3782 HReg vHi = generate_zeroes_V128(env);
3783 HReg vLo = newVRegV(env);
3784 addInstr(env, mk_vMOVsd_RR(vHi, vLo));
3785 *rHi = vHi;
3786 *rLo = vLo;
3787 return;
3788 }
3789 default:
3790 break; /* give up. Until such time as is necessary. */
3791 }
3792 }
3793
3794 if (e->tag == Iex_Unop) {
3795 switch (e->Iex.Unop.op) {
3796
3797 case Iop_NotV256: {
3798 HReg argHi, argLo;
3799 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3800 *rHi = do_sse_NotV128(env, argHi);
3801 *rLo = do_sse_NotV128(env, argLo);
3802 return;
3803 }
3804
3805 case Iop_RecipEst32Fx8: op = Asse_RCPF; goto do_32Fx8_unary;
3806 case Iop_Sqrt32Fx8: op = Asse_SQRTF; goto do_32Fx8_unary;
3807 case Iop_RSqrtEst32Fx8: op = Asse_RSQRTF; goto do_32Fx8_unary;
3808 do_32Fx8_unary:
3809 {
3810 HReg argHi, argLo;
3811 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3812 HReg dstHi = newVRegV(env);
3813 HReg dstLo = newVRegV(env);
3814 addInstr(env, AMD64Instr_Sse32Fx4(op, argHi, dstHi));
3815 addInstr(env, AMD64Instr_Sse32Fx4(op, argLo, dstLo));
3816 *rHi = dstHi;
3817 *rLo = dstLo;
3818 return;
3819 }
3820
3821 case Iop_Sqrt64Fx4: op = Asse_SQRTF; goto do_64Fx4_unary;
3822 do_64Fx4_unary:
3823 {
3824 HReg argHi, argLo;
3825 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3826 HReg dstHi = newVRegV(env);
3827 HReg dstLo = newVRegV(env);
3828 addInstr(env, AMD64Instr_Sse64Fx2(op, argHi, dstHi));
3829 addInstr(env, AMD64Instr_Sse64Fx2(op, argLo, dstLo));
3830 *rHi = dstHi;
3831 *rLo = dstLo;
3832 return;
3833 }
3834
3835 case Iop_CmpNEZ64x4: {
3836 /* We can use SSE2 instructions for this. */
3837 /* Same scheme as Iop_CmpNEZ64x2, except twice as wide
3838 (obviously). See comment on Iop_CmpNEZ64x2 for
3839 explanation of what's going on here. */
3840 HReg argHi, argLo;
3841 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3842 HReg tmpHi = generate_zeroes_V128(env);
3843 HReg tmpLo = newVRegV(env);
3844 addInstr(env, mk_vMOVsd_RR(tmpHi, tmpLo));
3845 HReg dstHi = newVRegV(env);
3846 HReg dstLo = newVRegV(env);
3847 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argHi, tmpHi));
3848 addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argLo, tmpLo));
3849 tmpHi = do_sse_NotV128(env, tmpHi);
3850 tmpLo = do_sse_NotV128(env, tmpLo);
3851 addInstr(env, AMD64Instr_SseShuf(0xB1, tmpHi, dstHi));
3852 addInstr(env, AMD64Instr_SseShuf(0xB1, tmpLo, dstLo));
3853 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpHi, dstHi));
3854 addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpLo, dstLo));
3855 *rHi = dstHi;
3856 *rLo = dstLo;
3857 return;
3858 }
3859
3860 case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
3861 case Iop_CmpNEZ16x16: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
3862 case Iop_CmpNEZ8x32: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
3863 do_CmpNEZ_vector:
3864 {
3865 HReg argHi, argLo;
3866 iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3867 HReg tmpHi = newVRegV(env);
3868 HReg tmpLo = newVRegV(env);
3869 HReg zero = generate_zeroes_V128(env);
3870 HReg dstHi, dstLo;
3871 addInstr(env, mk_vMOVsd_RR(argHi, tmpHi));
3872 addInstr(env, mk_vMOVsd_RR(argLo, tmpLo));
3873 addInstr(env, AMD64Instr_SseReRg(op, zero, tmpHi));
3874 addInstr(env, AMD64Instr_SseReRg(op, zero, tmpLo));
3875 dstHi = do_sse_NotV128(env, tmpHi);
3876 dstLo = do_sse_NotV128(env, tmpLo);
3877 *rHi = dstHi;
3878 *rLo = dstLo;
3879 return;
3880 }
3881
3882 default:
3883 break;
3884 } /* switch (e->Iex.Unop.op) */
3885 } /* if (e->tag == Iex_Unop) */
3886
3887 if (e->tag == Iex_Binop) {
3888 switch (e->Iex.Binop.op) {
3889
3890 case Iop_Max64Fx4: op = Asse_MAXF; goto do_64Fx4;
3891 case Iop_Min64Fx4: op = Asse_MINF; goto do_64Fx4;
3892 do_64Fx4:
3893 {
3894 HReg argLhi, argLlo, argRhi, argRlo;
3895 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
3896 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
3897 HReg dstHi = newVRegV(env);
3898 HReg dstLo = newVRegV(env);
3899 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
3900 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
3901 addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
3902 addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
3903 *rHi = dstHi;
3904 *rLo = dstLo;
3905 return;
3906 }
3907
3908 case Iop_Max32Fx8: op = Asse_MAXF; goto do_32Fx8;
3909 case Iop_Min32Fx8: op = Asse_MINF; goto do_32Fx8;
3910 do_32Fx8:
3911 {
3912 HReg argLhi, argLlo, argRhi, argRlo;
3913 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
3914 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
3915 HReg dstHi = newVRegV(env);
3916 HReg dstLo = newVRegV(env);
3917 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
3918 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
3919 addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
3920 addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
3921 *rHi = dstHi;
3922 *rLo = dstLo;
3923 return;
3924 }
3925
3926 case Iop_AndV256: op = Asse_AND; goto do_SseReRg;
3927 case Iop_OrV256: op = Asse_OR; goto do_SseReRg;
3928 case Iop_XorV256: op = Asse_XOR; goto do_SseReRg;
3929 case Iop_Add8x32: op = Asse_ADD8; goto do_SseReRg;
3930 case Iop_Add16x16: op = Asse_ADD16; goto do_SseReRg;
3931 case Iop_Add32x8: op = Asse_ADD32; goto do_SseReRg;
3932 case Iop_Add64x4: op = Asse_ADD64; goto do_SseReRg;
3933 case Iop_QAdd8Sx32: op = Asse_QADD8S; goto do_SseReRg;
3934 case Iop_QAdd16Sx16: op = Asse_QADD16S; goto do_SseReRg;
3935 case Iop_QAdd8Ux32: op = Asse_QADD8U; goto do_SseReRg;
3936 case Iop_QAdd16Ux16: op = Asse_QADD16U; goto do_SseReRg;
3937 case Iop_Avg8Ux32: op = Asse_AVG8U; goto do_SseReRg;
3938 case Iop_Avg16Ux16: op = Asse_AVG16U; goto do_SseReRg;
3939 case Iop_CmpEQ8x32: op = Asse_CMPEQ8; goto do_SseReRg;
3940 case Iop_CmpEQ16x16: op = Asse_CMPEQ16; goto do_SseReRg;
3941 case Iop_CmpEQ32x8: op = Asse_CMPEQ32; goto do_SseReRg;
3942 case Iop_CmpGT8Sx32: op = Asse_CMPGT8S; goto do_SseReRg;
3943 case Iop_CmpGT16Sx16: op = Asse_CMPGT16S; goto do_SseReRg;
3944 case Iop_CmpGT32Sx8: op = Asse_CMPGT32S; goto do_SseReRg;
3945 case Iop_Max16Sx16: op = Asse_MAX16S; goto do_SseReRg;
3946 case Iop_Max8Ux32: op = Asse_MAX8U; goto do_SseReRg;
3947 case Iop_Min16Sx16: op = Asse_MIN16S; goto do_SseReRg;
3948 case Iop_Min8Ux32: op = Asse_MIN8U; goto do_SseReRg;
3949 case Iop_MulHi16Ux16: op = Asse_MULHI16U; goto do_SseReRg;
3950 case Iop_MulHi16Sx16: op = Asse_MULHI16S; goto do_SseReRg;
3951 case Iop_Mul16x16: op = Asse_MUL16; goto do_SseReRg;
3952 case Iop_Sub8x32: op = Asse_SUB8; goto do_SseReRg;
3953 case Iop_Sub16x16: op = Asse_SUB16; goto do_SseReRg;
3954 case Iop_Sub32x8: op = Asse_SUB32; goto do_SseReRg;
3955 case Iop_Sub64x4: op = Asse_SUB64; goto do_SseReRg;
3956 case Iop_QSub8Sx32: op = Asse_QSUB8S; goto do_SseReRg;
3957 case Iop_QSub16Sx16: op = Asse_QSUB16S; goto do_SseReRg;
3958 case Iop_QSub8Ux32: op = Asse_QSUB8U; goto do_SseReRg;
3959 case Iop_QSub16Ux16: op = Asse_QSUB16U; goto do_SseReRg;
3960 do_SseReRg:
3961 {
3962 HReg argLhi, argLlo, argRhi, argRlo;
3963 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
3964 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
3965 HReg dstHi = newVRegV(env);
3966 HReg dstLo = newVRegV(env);
3967 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
3968 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
3969 addInstr(env, AMD64Instr_SseReRg(op, argRhi, dstHi));
3970 addInstr(env, AMD64Instr_SseReRg(op, argRlo, dstLo));
3971 *rHi = dstHi;
3972 *rLo = dstLo;
3973 return;
3974 }
3975
3976 case Iop_ShlN16x16: op = Asse_SHL16; goto do_SseShift;
3977 case Iop_ShlN32x8: op = Asse_SHL32; goto do_SseShift;
3978 case Iop_ShlN64x4: op = Asse_SHL64; goto do_SseShift;
3979 case Iop_SarN16x16: op = Asse_SAR16; goto do_SseShift;
3980 case Iop_SarN32x8: op = Asse_SAR32; goto do_SseShift;
3981 case Iop_ShrN16x16: op = Asse_SHR16; goto do_SseShift;
3982 case Iop_ShrN32x8: op = Asse_SHR32; goto do_SseShift;
3983 case Iop_ShrN64x4: op = Asse_SHR64; goto do_SseShift;
3984 do_SseShift: {
3985 HReg gregHi, gregLo;
3986 iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
3987 AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3988 AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3989 HReg ereg = newVRegV(env);
3990 HReg dstHi = newVRegV(env);
3991 HReg dstLo = newVRegV(env);
3992 addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3993 addInstr(env, AMD64Instr_Push(rmi));
3994 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
3995 addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
3996 addInstr(env, AMD64Instr_SseReRg(op, ereg, dstHi));
3997 addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
3998 addInstr(env, AMD64Instr_SseReRg(op, ereg, dstLo));
3999 add_to_rsp(env, 16);
4000 *rHi = dstHi;
4001 *rLo = dstLo;
4002 return;
4003 }
4004
4005 case Iop_V128HLtoV256: {
4006 *rHi = iselVecExpr(env, e->Iex.Binop.arg1);
4007 *rLo = iselVecExpr(env, e->Iex.Binop.arg2);
4008 return;
4009 }
4010
4011 case Iop_Mul32x8: fn = (HWord)h_generic_calc_Mul32x4;
4012 goto do_SseAssistedBinary;
4013 case Iop_Max32Sx8: fn = (HWord)h_generic_calc_Max32Sx4;
4014 goto do_SseAssistedBinary;
4015 case Iop_Min32Sx8: fn = (HWord)h_generic_calc_Min32Sx4;
4016 goto do_SseAssistedBinary;
4017 case Iop_Max32Ux8: fn = (HWord)h_generic_calc_Max32Ux4;
4018 goto do_SseAssistedBinary;
4019 case Iop_Min32Ux8: fn = (HWord)h_generic_calc_Min32Ux4;
4020 goto do_SseAssistedBinary;
4021 case Iop_Max16Ux16: fn = (HWord)h_generic_calc_Max16Ux8;
4022 goto do_SseAssistedBinary;
4023 case Iop_Min16Ux16: fn = (HWord)h_generic_calc_Min16Ux8;
4024 goto do_SseAssistedBinary;
4025 case Iop_Max8Sx32: fn = (HWord)h_generic_calc_Max8Sx16;
4026 goto do_SseAssistedBinary;
4027 case Iop_Min8Sx32: fn = (HWord)h_generic_calc_Min8Sx16;
4028 goto do_SseAssistedBinary;
4029 case Iop_CmpEQ64x4: fn = (HWord)h_generic_calc_CmpEQ64x2;
4030 goto do_SseAssistedBinary;
4031 case Iop_CmpGT64Sx4: fn = (HWord)h_generic_calc_CmpGT64Sx2;
4032 goto do_SseAssistedBinary;
4033 do_SseAssistedBinary: {
4034 /* RRRufff! RRRufff code is what we're generating here. Oh
4035 well. */
4036 vassert(fn != 0);
4037 HReg dstHi = newVRegV(env);
4038 HReg dstLo = newVRegV(env);
4039 HReg argLhi, argLlo, argRhi, argRlo;
4040 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4041 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4042 HReg argp = newVRegI(env);
4043 /* subq $160, %rsp -- make a space*/
4044 sub_from_rsp(env, 160);
4045 /* leaq 48(%rsp), %r_argp -- point into it */
4046 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4047 argp));
4048 /* andq $-16, %r_argp -- 16-align the pointer */
4049 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4050 AMD64RMI_Imm( ~(UInt)15 ),
4051 argp));
4052 /* Prepare 3 arg regs:
4053 leaq 0(%r_argp), %rdi
4054 leaq 16(%r_argp), %rsi
4055 leaq 32(%r_argp), %rdx
4056 */
4057 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4058 hregAMD64_RDI()));
4059 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
4060 hregAMD64_RSI()));
4061 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4062 hregAMD64_RDX()));
4063 /* Store the two high args, at (%rsi) and (%rdx):
4064 movupd %argLhi, 0(%rsi)
4065 movupd %argRhi, 0(%rdx)
4066 */
4067 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4068 AMD64AMode_IR(0, hregAMD64_RSI())));
4069 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4070 AMD64AMode_IR(0, hregAMD64_RDX())));
4071 /* Store the two low args, at 48(%rsi) and 48(%rdx):
4072 movupd %argLlo, 48(%rsi)
4073 movupd %argRlo, 48(%rdx)
4074 */
4075 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4076 AMD64AMode_IR(48, hregAMD64_RSI())));
4077 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4078 AMD64AMode_IR(48, hregAMD64_RDX())));
4079 /* call the helper */
4080 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4081 mk_RetLoc_simple(RLPri_None) ));
4082 /* Prepare 3 arg regs:
4083 leaq 48(%r_argp), %rdi
4084 leaq 64(%r_argp), %rsi
4085 leaq 80(%r_argp), %rdx
4086 */
4087 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, argp),
4088 hregAMD64_RDI()));
4089 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4090 hregAMD64_RSI()));
4091 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(80, argp),
4092 hregAMD64_RDX()));
4093 /* call the helper */
4094 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4095 mk_RetLoc_simple(RLPri_None) ));
4096 /* fetch the result from memory, using %r_argp, which the
4097 register allocator will keep alive across the call. */
4098 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4099 AMD64AMode_IR(0, argp)));
4100 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4101 AMD64AMode_IR(48, argp)));
4102 /* and finally, clear the space */
4103 add_to_rsp(env, 160);
4104 *rHi = dstHi;
4105 *rLo = dstLo;
4106 return;
4107 }
4108
4109 case Iop_Perm32x8: fn = (HWord)h_generic_calc_Perm32x8;
4110 goto do_SseAssistedBinary256;
4111 do_SseAssistedBinary256: {
4112 /* RRRufff! RRRufff code is what we're generating here. Oh
4113 well. */
4114 vassert(fn != 0);
4115 HReg dstHi = newVRegV(env);
4116 HReg dstLo = newVRegV(env);
4117 HReg argLhi, argLlo, argRhi, argRlo;
4118 iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4119 iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4120 HReg argp = newVRegI(env);
4121 /* subq $160, %rsp -- make a space*/
4122 sub_from_rsp(env, 160);
4123 /* leaq 48(%rsp), %r_argp -- point into it */
4124 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4125 argp));
4126 /* andq $-16, %r_argp -- 16-align the pointer */
4127 addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4128 AMD64RMI_Imm( ~(UInt)15 ),
4129 argp));
4130 /* Prepare 3 arg regs:
4131 leaq 0(%r_argp), %rdi
4132 leaq 32(%r_argp), %rsi
4133 leaq 64(%r_argp), %rdx
4134 */
4135 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4136 hregAMD64_RDI()));
4137 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4138 hregAMD64_RSI()));
4139 addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4140 hregAMD64_RDX()));
4141 /* Store the two args, at (%rsi) and (%rdx):
4142 movupd %argLlo, 0(%rsi)
4143 movupd %argLhi, 16(%rsi)
4144 movupd %argRlo, 0(%rdx)
4145 movupd %argRhi, 16(%rdx)
4146 */
4147 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4148 AMD64AMode_IR(0, hregAMD64_RSI())));
4149 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4150 AMD64AMode_IR(16, hregAMD64_RSI())));
4151 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4152 AMD64AMode_IR(0, hregAMD64_RDX())));
4153 addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4154 AMD64AMode_IR(16, hregAMD64_RDX())));
4155 /* call the helper */
4156 addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4157 mk_RetLoc_simple(RLPri_None) ));
4158 /* fetch the result from memory, using %r_argp, which the
4159 register allocator will keep alive across the call. */
4160 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4161 AMD64AMode_IR(0, argp)));
4162 addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4163 AMD64AMode_IR(16, argp)));
4164 /* and finally, clear the space */
4165 add_to_rsp(env, 160);
4166 *rHi = dstHi;
4167 *rLo = dstLo;
4168 return;
4169 }
4170
4171 default:
4172 break;
4173 } /* switch (e->Iex.Binop.op) */
4174 } /* if (e->tag == Iex_Binop) */
4175
4176 if (e->tag == Iex_Triop) {
4177 IRTriop *triop = e->Iex.Triop.details;
4178 switch (triop->op) {
4179
4180 case Iop_Add64Fx4: op = Asse_ADDF; goto do_64Fx4_w_rm;
4181 case Iop_Sub64Fx4: op = Asse_SUBF; goto do_64Fx4_w_rm;
4182 case Iop_Mul64Fx4: op = Asse_MULF; goto do_64Fx4_w_rm;
4183 case Iop_Div64Fx4: op = Asse_DIVF; goto do_64Fx4_w_rm;
4184 do_64Fx4_w_rm:
4185 {
4186 HReg argLhi, argLlo, argRhi, argRlo;
4187 iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4188 iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4189 HReg dstHi = newVRegV(env);
4190 HReg dstLo = newVRegV(env);
4191 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4192 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4193 /* XXXROUNDINGFIXME */
4194 /* set roundingmode here */
4195 addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4196 addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4197 *rHi = dstHi;
4198 *rLo = dstLo;
4199 return;
4200 }
4201
4202 case Iop_Add32Fx8: op = Asse_ADDF; goto do_32Fx8_w_rm;
4203 case Iop_Sub32Fx8: op = Asse_SUBF; goto do_32Fx8_w_rm;
4204 case Iop_Mul32Fx8: op = Asse_MULF; goto do_32Fx8_w_rm;
4205 case Iop_Div32Fx8: op = Asse_DIVF; goto do_32Fx8_w_rm;
4206 do_32Fx8_w_rm:
4207 {
4208 HReg argLhi, argLlo, argRhi, argRlo;
4209 iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4210 iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4211 HReg dstHi = newVRegV(env);
4212 HReg dstLo = newVRegV(env);
4213 addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4214 addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4215 /* XXXROUNDINGFIXME */
4216 /* set roundingmode here */
4217 addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4218 addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4219 *rHi = dstHi;
4220 *rLo = dstLo;
4221 return;
4222 }
4223
4224 default:
4225 break;
4226 } /* switch (triop->op) */
4227 } /* if (e->tag == Iex_Triop) */
4228
4229
4230 if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) {
4231 HReg rsp = hregAMD64_RSP();
4232 HReg vHi = newVRegV(env);
4233 HReg vLo = newVRegV(env);
4234 AMD64AMode* m8_rsp = AMD64AMode_IR(-8, rsp);
4235 AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
4236 /* arg1 is the most significant (Q3), arg4 the least (Q0) */
4237 /* Get all the args into regs, before messing with the stack. */
4238 AMD64RI* q3 = iselIntExpr_RI(env, e->Iex.Qop.details->arg1);
4239 AMD64RI* q2 = iselIntExpr_RI(env, e->Iex.Qop.details->arg2);
4240 AMD64RI* q1 = iselIntExpr_RI(env, e->Iex.Qop.details->arg3);
4241 AMD64RI* q0 = iselIntExpr_RI(env, e->Iex.Qop.details->arg4);
4242 /* less significant lane (Q2) at the lower address (-16(rsp)) */
4243 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q3, m8_rsp));
4244 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q2, m16_rsp));
4245 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, m16_rsp));
4246 /* and then the lower half .. */
4247 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q1, m8_rsp));
4248 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q0, m16_rsp));
4249 addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, m16_rsp));
4250 *rHi = vHi;
4251 *rLo = vLo;
4252 return;
4253 }
4254
4255 if (e->tag == Iex_ITE) {
4256 HReg r1Hi, r1Lo, r0Hi, r0Lo;
4257 iselDVecExpr(&r1Hi, &r1Lo, env, e->Iex.ITE.iftrue);
4258 iselDVecExpr(&r0Hi, &r0Lo, env, e->Iex.ITE.iffalse);
4259 HReg dstHi = newVRegV(env);
4260 HReg dstLo = newVRegV(env);
4261 addInstr(env, mk_vMOVsd_RR(r1Hi,dstHi));
4262 addInstr(env, mk_vMOVsd_RR(r1Lo,dstLo));
4263 AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
4264 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Hi, dstHi));
4265 addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Lo, dstLo));
4266 *rHi = dstHi;
4267 *rLo = dstLo;
4268 return;
4269 }
4270
4271 //avx_fail:
4272 vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n",
4273 LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
4274 ppIRExpr(e);
4275 vpanic("iselDVecExpr_wrk");
4276 }
4277
4278
4279 /*---------------------------------------------------------*/
4280 /*--- ISEL: Statements ---*/
4281 /*---------------------------------------------------------*/
4282
iselStmt(ISelEnv * env,IRStmt * stmt)4283 static void iselStmt ( ISelEnv* env, IRStmt* stmt )
4284 {
4285 if (vex_traceflags & VEX_TRACE_VCODE) {
4286 vex_printf("\n-- ");
4287 ppIRStmt(stmt);
4288 vex_printf("\n");
4289 }
4290
4291 switch (stmt->tag) {
4292
4293 /* --------- LOADG (guarded load) --------- */
4294 case Ist_LoadG: {
4295 IRLoadG* lg = stmt->Ist.LoadG.details;
4296 if (lg->end != Iend_LE)
4297 goto stmt_fail;
4298
4299 UChar szB = 0; /* invalid */
4300 switch (lg->cvt) {
4301 case ILGop_Ident32: szB = 4; break;
4302 case ILGop_Ident64: szB = 8; break;
4303 default: break;
4304 }
4305 if (szB == 0)
4306 goto stmt_fail;
4307
4308 AMD64AMode* amAddr = iselIntExpr_AMode(env, lg->addr);
4309 HReg rAlt = iselIntExpr_R(env, lg->alt);
4310 HReg rDst = lookupIRTemp(env, lg->dst);
4311 /* Get the alt value into the dst. We'll do a conditional load
4312 which overwrites it -- or not -- with loaded data. */
4313 addInstr(env, mk_iMOVsd_RR(rAlt, rDst));
4314 AMD64CondCode cc = iselCondCode(env, lg->guard);
4315 addInstr(env, AMD64Instr_CLoad(cc, szB, amAddr, rDst));
4316 return;
4317 }
4318
4319 /* --------- STOREG (guarded store) --------- */
4320 case Ist_StoreG: {
4321 IRStoreG* sg = stmt->Ist.StoreG.details;
4322 if (sg->end != Iend_LE)
4323 goto stmt_fail;
4324
4325 UChar szB = 0; /* invalid */
4326 switch (typeOfIRExpr(env->type_env, sg->data)) {
4327 case Ity_I32: szB = 4; break;
4328 case Ity_I64: szB = 8; break;
4329 default: break;
4330 }
4331 if (szB == 0)
4332 goto stmt_fail;
4333
4334 AMD64AMode* amAddr = iselIntExpr_AMode(env, sg->addr);
4335 HReg rSrc = iselIntExpr_R(env, sg->data);
4336 AMD64CondCode cc = iselCondCode(env, sg->guard);
4337 addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr));
4338 return;
4339 }
4340
4341 /* --------- STORE --------- */
4342 case Ist_Store: {
4343 IRType tya = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
4344 IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
4345 IREndness end = stmt->Ist.Store.end;
4346
4347 if (tya != Ity_I64 || end != Iend_LE)
4348 goto stmt_fail;
4349
4350 if (tyd == Ity_I64) {
4351 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4352 AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
4353 addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am));
4354 return;
4355 }
4356 if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) {
4357 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4358 HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
4359 addInstr(env, AMD64Instr_Store(
4360 toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)),
4361 r,am));
4362 return;
4363 }
4364 if (tyd == Ity_F64) {
4365 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4366 HReg r = iselDblExpr(env, stmt->Ist.Store.data);
4367 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am));
4368 return;
4369 }
4370 if (tyd == Ity_F32) {
4371 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4372 HReg r = iselFltExpr(env, stmt->Ist.Store.data);
4373 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am));
4374 return;
4375 }
4376 if (tyd == Ity_V128) {
4377 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4378 HReg r = iselVecExpr(env, stmt->Ist.Store.data);
4379 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am));
4380 return;
4381 }
4382 if (tyd == Ity_V256) {
4383 HReg rA = iselIntExpr_R(env, stmt->Ist.Store.addr);
4384 AMD64AMode* am0 = AMD64AMode_IR(0, rA);
4385 AMD64AMode* am16 = AMD64AMode_IR(16, rA);
4386 HReg vHi, vLo;
4387 iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Store.data);
4388 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4389 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4390 return;
4391 }
4392 break;
4393 }
4394
4395 /* --------- PUT --------- */
4396 case Ist_Put: {
4397 IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
4398 if (ty == Ity_I64) {
4399 /* We're going to write to memory, so compute the RHS into an
4400 AMD64RI. */
4401 AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
4402 addInstr(env,
4403 AMD64Instr_Alu64M(
4404 Aalu_MOV,
4405 ri,
4406 AMD64AMode_IR(stmt->Ist.Put.offset,
4407 hregAMD64_RBP())
4408 ));
4409 return;
4410 }
4411 if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
4412 HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
4413 addInstr(env, AMD64Instr_Store(
4414 toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
4415 r,
4416 AMD64AMode_IR(stmt->Ist.Put.offset,
4417 hregAMD64_RBP())));
4418 return;
4419 }
4420 if (ty == Ity_F32) {
4421 HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
4422 AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP());
4423 set_SSE_rounding_default(env); /* paranoia */
4424 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am ));
4425 return;
4426 }
4427 if (ty == Ity_F64) {
4428 HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
4429 AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset,
4430 hregAMD64_RBP() );
4431 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am ));
4432 return;
4433 }
4434 if (ty == Ity_V128) {
4435 HReg vec = iselVecExpr(env, stmt->Ist.Put.data);
4436 AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset,
4437 hregAMD64_RBP());
4438 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am));
4439 return;
4440 }
4441 if (ty == Ity_V256) {
4442 HReg vHi, vLo;
4443 iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Put.data);
4444 HReg rbp = hregAMD64_RBP();
4445 AMD64AMode* am0 = AMD64AMode_IR(stmt->Ist.Put.offset + 0, rbp);
4446 AMD64AMode* am16 = AMD64AMode_IR(stmt->Ist.Put.offset + 16, rbp);
4447 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4448 addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4449 return;
4450 }
4451 break;
4452 }
4453
4454 /* --------- Indexed PUT --------- */
4455 case Ist_PutI: {
4456 IRPutI *puti = stmt->Ist.PutI.details;
4457
4458 AMD64AMode* am
4459 = genGuestArrayOffset(
4460 env, puti->descr,
4461 puti->ix, puti->bias );
4462
4463 IRType ty = typeOfIRExpr(env->type_env, puti->data);
4464 if (ty == Ity_F64) {
4465 HReg val = iselDblExpr(env, puti->data);
4466 addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am ));
4467 return;
4468 }
4469 if (ty == Ity_I8) {
4470 HReg r = iselIntExpr_R(env, puti->data);
4471 addInstr(env, AMD64Instr_Store( 1, r, am ));
4472 return;
4473 }
4474 if (ty == Ity_I64) {
4475 AMD64RI* ri = iselIntExpr_RI(env, puti->data);
4476 addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am ));
4477 return;
4478 }
4479 break;
4480 }
4481
4482 /* --------- TMP --------- */
4483 case Ist_WrTmp: {
4484 IRTemp tmp = stmt->Ist.WrTmp.tmp;
4485 IRType ty = typeOfIRTemp(env->type_env, tmp);
4486
4487 /* optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
4488 compute it into an AMode and then use LEA. This usually
4489 produces fewer instructions, often because (for memcheck
4490 created IR) we get t = address-expression, (t is later used
4491 twice) and so doing this naturally turns address-expression
4492 back into an AMD64 amode. */
4493 if (ty == Ity_I64
4494 && stmt->Ist.WrTmp.data->tag == Iex_Binop
4495 && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) {
4496 AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
4497 HReg dst = lookupIRTemp(env, tmp);
4498 if (am->tag == Aam_IR && am->Aam.IR.imm == 0) {
4499 /* Hmm, iselIntExpr_AMode wimped out and just computed the
4500 value into a register. Just emit a normal reg-reg move
4501 so reg-alloc can coalesce it away in the usual way. */
4502 HReg src = am->Aam.IR.reg;
4503 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst));
4504 } else {
4505 addInstr(env, AMD64Instr_Lea64(am,dst));
4506 }
4507 return;
4508 }
4509
4510 if (ty == Ity_I64 || ty == Ity_I32
4511 || ty == Ity_I16 || ty == Ity_I8) {
4512 AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
4513 HReg dst = lookupIRTemp(env, tmp);
4514 addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst));
4515 return;
4516 }
4517 if (ty == Ity_I128) {
4518 HReg rHi, rLo, dstHi, dstLo;
4519 iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4520 lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4521 addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
4522 addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
4523 return;
4524 }
4525 if (ty == Ity_I1) {
4526 AMD64CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
4527 HReg dst = lookupIRTemp(env, tmp);
4528 addInstr(env, AMD64Instr_Set64(cond, dst));
4529 return;
4530 }
4531 if (ty == Ity_F64) {
4532 HReg dst = lookupIRTemp(env, tmp);
4533 HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
4534 addInstr(env, mk_vMOVsd_RR(src, dst));
4535 return;
4536 }
4537 if (ty == Ity_F32) {
4538 HReg dst = lookupIRTemp(env, tmp);
4539 HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
4540 addInstr(env, mk_vMOVsd_RR(src, dst));
4541 return;
4542 }
4543 if (ty == Ity_V128) {
4544 HReg dst = lookupIRTemp(env, tmp);
4545 HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
4546 addInstr(env, mk_vMOVsd_RR(src, dst));
4547 return;
4548 }
4549 if (ty == Ity_V256) {
4550 HReg rHi, rLo, dstHi, dstLo;
4551 iselDVecExpr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4552 lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4553 addInstr(env, mk_vMOVsd_RR(rHi,dstHi) );
4554 addInstr(env, mk_vMOVsd_RR(rLo,dstLo) );
4555 return;
4556 }
4557 break;
4558 }
4559
4560 /* --------- Call to DIRTY helper --------- */
4561 case Ist_Dirty: {
4562 IRDirty* d = stmt->Ist.Dirty.details;
4563
4564 /* Figure out the return type, if any. */
4565 IRType retty = Ity_INVALID;
4566 if (d->tmp != IRTemp_INVALID)
4567 retty = typeOfIRTemp(env->type_env, d->tmp);
4568
4569 /* Throw out any return types we don't know about. */
4570 Bool retty_ok = False;
4571 switch (retty) {
4572 case Ity_INVALID: /* function doesn't return anything */
4573 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
4574 case Ity_V128: case Ity_V256:
4575 retty_ok = True; break;
4576 default:
4577 break;
4578 }
4579 if (!retty_ok)
4580 break; /* will go to stmt_fail: */
4581
4582 /* Marshal args, do the call, and set the return value to
4583 0x555..555 if this is a conditional call that returns a value
4584 and the call is skipped. */
4585 UInt addToSp = 0;
4586 RetLoc rloc = mk_RetLoc_INVALID();
4587 doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
4588 vassert(is_sane_RetLoc(rloc));
4589
4590 /* Now figure out what to do with the returned value, if any. */
4591 switch (retty) {
4592 case Ity_INVALID: {
4593 /* No return value. Nothing to do. */
4594 vassert(d->tmp == IRTemp_INVALID);
4595 vassert(rloc.pri == RLPri_None);
4596 vassert(addToSp == 0);
4597 return;
4598 }
4599 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: {
4600 /* The returned value is in %rax. Park it in the register
4601 associated with tmp. */
4602 vassert(rloc.pri == RLPri_Int);
4603 vassert(addToSp == 0);
4604 HReg dst = lookupIRTemp(env, d->tmp);
4605 addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
4606 return;
4607 }
4608 case Ity_V128: {
4609 /* The returned value is on the stack, and rloc.spOff
4610 tells us where. Fish it off the stack and then move
4611 the stack pointer upwards to clear it, as directed by
4612 doHelperCall. */
4613 vassert(rloc.pri == RLPri_V128SpRel);
4614 vassert(addToSp >= 16);
4615 HReg dst = lookupIRTemp(env, d->tmp);
4616 AMD64AMode* am = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
4617 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
4618 add_to_rsp(env, addToSp);
4619 return;
4620 }
4621 case Ity_V256: {
4622 /* See comments for Ity_V128. */
4623 vassert(rloc.pri == RLPri_V256SpRel);
4624 vassert(addToSp >= 32);
4625 HReg dstLo, dstHi;
4626 lookupIRTempPair(&dstHi, &dstLo, env, d->tmp);
4627 AMD64AMode* amLo = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
4628 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstLo, amLo ));
4629 AMD64AMode* amHi = AMD64AMode_IR(rloc.spOff+16, hregAMD64_RSP());
4630 addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstHi, amHi ));
4631 add_to_rsp(env, addToSp);
4632 return;
4633 }
4634 default:
4635 /*NOTREACHED*/
4636 vassert(0);
4637 }
4638 break;
4639 }
4640
4641 /* --------- MEM FENCE --------- */
4642 case Ist_MBE:
4643 switch (stmt->Ist.MBE.event) {
4644 case Imbe_Fence:
4645 addInstr(env, AMD64Instr_MFence());
4646 return;
4647 default:
4648 break;
4649 }
4650 break;
4651
4652 /* --------- ACAS --------- */
4653 case Ist_CAS:
4654 if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
4655 /* "normal" singleton CAS */
4656 UChar sz;
4657 IRCAS* cas = stmt->Ist.CAS.details;
4658 IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
4659 /* get: cas->expd into %rax, and cas->data into %rbx */
4660 AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4661 HReg rData = iselIntExpr_R(env, cas->dataLo);
4662 HReg rExpd = iselIntExpr_R(env, cas->expdLo);
4663 HReg rOld = lookupIRTemp(env, cas->oldLo);
4664 vassert(cas->expdHi == NULL);
4665 vassert(cas->dataHi == NULL);
4666 addInstr(env, mk_iMOVsd_RR(rExpd, rOld));
4667 addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX()));
4668 addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX()));
4669 switch (ty) {
4670 case Ity_I64: sz = 8; break;
4671 case Ity_I32: sz = 4; break;
4672 case Ity_I16: sz = 2; break;
4673 case Ity_I8: sz = 1; break;
4674 default: goto unhandled_cas;
4675 }
4676 addInstr(env, AMD64Instr_ACAS(am, sz));
4677 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOld));
4678 return;
4679 } else {
4680 /* double CAS */
4681 UChar sz;
4682 IRCAS* cas = stmt->Ist.CAS.details;
4683 IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
4684 /* only 32-bit and 64-bit allowed in this case */
4685 /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
4686 /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
4687 AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4688 HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
4689 HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
4690 HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
4691 HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
4692 HReg rOldHi = lookupIRTemp(env, cas->oldHi);
4693 HReg rOldLo = lookupIRTemp(env, cas->oldLo);
4694 switch (ty) {
4695 case Ity_I64:
4696 if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16))
4697 goto unhandled_cas; /* we'd have to generate
4698 cmpxchg16b, but the host
4699 doesn't support that */
4700 sz = 8;
4701 break;
4702 case Ity_I32:
4703 sz = 4;
4704 break;
4705 default:
4706 goto unhandled_cas;
4707 }
4708 addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
4709 addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
4710 addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX()));
4711 addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX()));
4712 addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX()));
4713 addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX()));
4714 addInstr(env, AMD64Instr_DACAS(am, sz));
4715 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RDX(), rOldHi));
4716 addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOldLo));
4717 return;
4718 }
4719 unhandled_cas:
4720 break;
4721
4722 /* --------- INSTR MARK --------- */
4723 /* Doesn't generate any executable code ... */
4724 case Ist_IMark:
4725 return;
4726
4727 /* --------- ABI HINT --------- */
4728 /* These have no meaning (denotation in the IR) and so we ignore
4729 them ... if any actually made it this far. */
4730 case Ist_AbiHint:
4731 return;
4732
4733 /* --------- NO-OP --------- */
4734 case Ist_NoOp:
4735 return;
4736
4737 /* --------- EXIT --------- */
4738 case Ist_Exit: {
4739 if (stmt->Ist.Exit.dst->tag != Ico_U64)
4740 vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");
4741
4742 AMD64CondCode cc = iselCondCode(env, stmt->Ist.Exit.guard);
4743 AMD64AMode* amRIP = AMD64AMode_IR(stmt->Ist.Exit.offsIP,
4744 hregAMD64_RBP());
4745
4746 /* Case: boring transfer to known address */
4747 if (stmt->Ist.Exit.jk == Ijk_Boring) {
4748 if (env->chainingAllowed) {
4749 /* .. almost always true .. */
4750 /* Skip the event check at the dst if this is a forwards
4751 edge. */
4752 Bool toFastEP
4753 = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga;
4754 if (0) vex_printf("%s", toFastEP ? "Y" : ",");
4755 addInstr(env, AMD64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64,
4756 amRIP, cc, toFastEP));
4757 } else {
4758 /* .. very occasionally .. */
4759 /* We can't use chaining, so ask for an assisted transfer,
4760 as that's the only alternative that is allowable. */
4761 HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
4762 addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, Ijk_Boring));
4763 }
4764 return;
4765 }
4766
4767 /* Case: assisted transfer to arbitrary address */
4768 switch (stmt->Ist.Exit.jk) {
4769 /* Keep this list in sync with that in iselNext below */
4770 case Ijk_ClientReq:
4771 case Ijk_EmWarn:
4772 case Ijk_NoDecode:
4773 case Ijk_NoRedir:
4774 case Ijk_SigSEGV:
4775 case Ijk_SigTRAP:
4776 case Ijk_Sys_syscall:
4777 case Ijk_InvalICache:
4778 case Ijk_Yield:
4779 {
4780 HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
4781 addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, stmt->Ist.Exit.jk));
4782 return;
4783 }
4784 default:
4785 break;
4786 }
4787
4788 /* Do we ever expect to see any other kind? */
4789 goto stmt_fail;
4790 }
4791
4792 default: break;
4793 }
4794 stmt_fail:
4795 ppIRStmt(stmt);
4796 vpanic("iselStmt(amd64)");
4797 }
4798
4799
4800 /*---------------------------------------------------------*/
4801 /*--- ISEL: Basic block terminators (Nexts) ---*/
4802 /*---------------------------------------------------------*/
4803
/* Emit the instruction that ends the block: a transfer to guest
   address NEXT with jump kind JK.  OFFSIP is used only to form the
   amode OFFSIP(%rbp) that is handed to the exit instruction (and is
   what the trace output shows as "PUT(offsIP)").  Panics via
   vassert(0) if JK is a kind that is not handled here. */
static void iselNext ( ISelEnv* env,
                       IRExpr* next, IRJumpKind jk, Int offsIP )
{
   /* Optional tracing of the block terminator. */
   if (vex_traceflags & VEX_TRACE_VCODE) {
      vex_printf( "\n-- PUT(%d) = ", offsIP);
      ppIRExpr( next );
      vex_printf( "; exit-");
      ppIRJumpKind(jk);
      vex_printf( "\n");
   }

   /* Case: boring/call transfer to an address known at translation
      time.  The constant must be 64 bits wide whatever the jump
      kind is. */
   if (next->tag == Iex_Const) {
      IRConst* target = next->Iex.Const.con;
      vassert(target->tag == Ico_U64);
      if (jk == Ijk_Boring || jk == Ijk_Call) {
         AMD64AMode* amIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
         if (!env->chainingAllowed) {
            /* Rare path: chaining is off, so a direct (patchable)
               jump cannot be used.  Put the target in a register
               and request an assisted transfer instead. */
            HReg rTarget = iselIntExpr_R(env, next);
            addInstr(env, AMD64Instr_XAssisted(rTarget, amIP, Acc_ALWAYS,
                                               Ijk_Boring));
            return;
         }
         /* Usual path: chainable direct jump.  A destination above
            max_ga is treated as a forwards edge, which may skip the
            event check at the destination (fast entry point). */
         Bool toFastEP = ((Addr64)target->Ico.U64) > env->max_ga;
         if (0) vex_printf("%s", toFastEP ? "X" : ".");
         addInstr(env, AMD64Instr_XDirect(target->Ico.U64,
                                          amIP, Acc_ALWAYS,
                                          toFastEP));
         return;
      }
   }

   /* All remaining cases: the target is computed into a register. */
   switch (jk) {

      /* Case: call/return (==boring) transfer to any address */
      case Ijk_Boring:
      case Ijk_Ret:
      case Ijk_Call: {
         HReg        rTarget = iselIntExpr_R(env, next);
         AMD64AMode* amIP    = AMD64AMode_IR(offsIP, hregAMD64_RBP());
         if (!env->chainingAllowed) {
            addInstr(env, AMD64Instr_XAssisted(rTarget, amIP, Acc_ALWAYS,
                                               Ijk_Boring));
         } else {
            addInstr(env, AMD64Instr_XIndir(rTarget, amIP, Acc_ALWAYS));
         }
         return;
      }

      /* Case: assisted transfer to arbitrary address.
         Keep this list in sync with that for Ist_Exit above */
      case Ijk_ClientReq:
      case Ijk_EmWarn:
      case Ijk_NoDecode:
      case Ijk_NoRedir:
      case Ijk_SigSEGV:
      case Ijk_SigTRAP:
      case Ijk_Sys_syscall:
      case Ijk_InvalICache:
      case Ijk_Yield: {
         HReg        rTarget = iselIntExpr_R(env, next);
         AMD64AMode* amIP    = AMD64AMode_IR(offsIP, hregAMD64_RBP());
         addInstr(env, AMD64Instr_XAssisted(rTarget, amIP, Acc_ALWAYS, jk));
         return;
      }

      default:
         break;
   }

   /* Unhandled jump kind: show what we were asked to do, then give
      up.  Are we expecting any other kind? */
   vex_printf( "\n-- PUT(%d) = ", offsIP);
   ppIRExpr( next );
   vex_printf( "; exit-");
   ppIRJumpKind(jk);
   vex_printf( "\n");
   vassert(0);
}
4890
4891
4892 /*---------------------------------------------------------*/
4893 /*--- Insn selector top-level ---*/
4894 /*---------------------------------------------------------*/
4895
4896 /* Translate an entire SB to amd64 code. */
4897
iselSB_AMD64(const IRSB * bb,VexArch arch_host,const VexArchInfo * archinfo_host,const VexAbiInfo * vbi,Int offs_Host_EvC_Counter,Int offs_Host_EvC_FailAddr,Bool chainingAllowed,Bool addProfInc,Addr max_ga)4898 HInstrArray* iselSB_AMD64 ( const IRSB* bb,
4899 VexArch arch_host,
4900 const VexArchInfo* archinfo_host,
4901 const VexAbiInfo* vbi/*UNUSED*/,
4902 Int offs_Host_EvC_Counter,
4903 Int offs_Host_EvC_FailAddr,
4904 Bool chainingAllowed,
4905 Bool addProfInc,
4906 Addr max_ga )
4907 {
4908 Int i, j;
4909 HReg hreg, hregHI;
4910 ISelEnv* env;
4911 UInt hwcaps_host = archinfo_host->hwcaps;
4912 AMD64AMode *amCounter, *amFailAddr;
4913
4914 /* sanity ... */
4915 vassert(arch_host == VexArchAMD64);
4916 vassert(0 == (hwcaps_host
4917 & ~(VEX_HWCAPS_AMD64_SSE3
4918 | VEX_HWCAPS_AMD64_CX16
4919 | VEX_HWCAPS_AMD64_LZCNT
4920 | VEX_HWCAPS_AMD64_AVX
4921 | VEX_HWCAPS_AMD64_RDTSCP
4922 | VEX_HWCAPS_AMD64_BMI
4923 | VEX_HWCAPS_AMD64_AVX2)));
4924
4925 /* Check that the host's endianness is as expected. */
4926 vassert(archinfo_host->endness == VexEndnessLE);
4927
4928 /* Make up an initial environment to use. */
4929 env = LibVEX_Alloc_inline(sizeof(ISelEnv));
4930 env->vreg_ctr = 0;
4931
4932 /* Set up output code array. */
4933 env->code = newHInstrArray();
4934
4935 /* Copy BB's type env. */
4936 env->type_env = bb->tyenv;
4937
4938 /* Make up an IRTemp -> virtual HReg mapping. This doesn't
4939 change as we go along. */
4940 env->n_vregmap = bb->tyenv->types_used;
4941 env->vregmap = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
4942 env->vregmapHI = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
4943
4944 /* and finally ... */
4945 env->chainingAllowed = chainingAllowed;
4946 env->hwcaps = hwcaps_host;
4947 env->max_ga = max_ga;
4948
4949 /* For each IR temporary, allocate a suitably-kinded virtual
4950 register. */
4951 j = 0;
4952 for (i = 0; i < env->n_vregmap; i++) {
4953 hregHI = hreg = INVALID_HREG;
4954 switch (bb->tyenv->types[i]) {
4955 case Ity_I1:
4956 case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
4957 hreg = mkHReg(True, HRcInt64, 0, j++);
4958 break;
4959 case Ity_I128:
4960 hreg = mkHReg(True, HRcInt64, 0, j++);
4961 hregHI = mkHReg(True, HRcInt64, 0, j++);
4962 break;
4963 case Ity_F32:
4964 case Ity_F64:
4965 case Ity_V128:
4966 hreg = mkHReg(True, HRcVec128, 0, j++);
4967 break;
4968 case Ity_V256:
4969 hreg = mkHReg(True, HRcVec128, 0, j++);
4970 hregHI = mkHReg(True, HRcVec128, 0, j++);
4971 break;
4972 default:
4973 ppIRType(bb->tyenv->types[i]);
4974 vpanic("iselBB(amd64): IRTemp type");
4975 }
4976 env->vregmap[i] = hreg;
4977 env->vregmapHI[i] = hregHI;
4978 }
4979 env->vreg_ctr = j;
4980
4981 /* The very first instruction must be an event check. */
4982 amCounter = AMD64AMode_IR(offs_Host_EvC_Counter, hregAMD64_RBP());
4983 amFailAddr = AMD64AMode_IR(offs_Host_EvC_FailAddr, hregAMD64_RBP());
4984 addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr));
4985
4986 /* Possibly a block counter increment (for profiling). At this
4987 point we don't know the address of the counter, so just pretend
4988 it is zero. It will have to be patched later, but before this
4989 translation is used, by a call to LibVEX_patchProfCtr. */
4990 if (addProfInc) {
4991 addInstr(env, AMD64Instr_ProfInc());
4992 }
4993
4994 /* Ok, finally we can iterate over the statements. */
4995 for (i = 0; i < bb->stmts_used; i++)
4996 if (bb->stmts[i])
4997 iselStmt(env, bb->stmts[i]);
4998
4999 iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
5000
5001 /* record the number of vregs we used. */
5002 env->code->n_vregs = env->vreg_ctr;
5003 return env->code;
5004 }
5005
5006
5007 /*---------------------------------------------------------------*/
5008 /*--- end host_amd64_isel.c ---*/
5009 /*---------------------------------------------------------------*/
5010