1 
2 /*---------------------------------------------------------------*/
3 /*--- begin                                 host_reg_alloc2.c ---*/
4 /*---------------------------------------------------------------*/
5 
6 /*
7    This file is part of Valgrind, a dynamic binary instrumentation
8    framework.
9 
10    Copyright (C) 2004-2015 OpenWorks LLP
11       info@open-works.net
12 
13    This program is free software; you can redistribute it and/or
14    modify it under the terms of the GNU General Public License as
15    published by the Free Software Foundation; either version 2 of the
16    License, or (at your option) any later version.
17 
18    This program is distributed in the hope that it will be useful, but
19    WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21    General Public License for more details.
22 
23    You should have received a copy of the GNU General Public License
24    along with this program; if not, write to the Free Software
25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26    02110-1301, USA.
27 
28    The GNU General Public License is contained in the file COPYING.
29 
30    Neither the names of the U.S. Department of Energy nor the
31    University of California nor the names of its contributors may be
32    used to endorse or promote products derived from this software
33    without prior written permission.
34 */
35 
36 #include "libvex_basictypes.h"
37 #include "libvex.h"
38 
39 #include "main_util.h"
40 #include "host_generic_regs.h"
41 
42 /* Set to 1 for lots of debugging output. */
43 #define DEBUG_REGALLOC 0
44 
45 
46 /* TODO 27 Oct 04:
47 
48    Better consistency checking from what isMove tells us.
49 
50    We can possibly do V-V coalescing even when the src is spilled,
51    providing we can arrange for the dst to have the same spill slot.
52 
53    Note that state[].hreg is the same as the available real regs.
54 
55    Generally rationalise data structures.  */
56 
57 
58 /* Records information on virtual register live ranges.  Computed once
59    and remains unchanged after that. */
60 typedef
61    struct {
62       /* Becomes live for the first time after this insn ... */
63       Short live_after;
64       /* Becomes dead for the last time before this insn ... */
65       Short dead_before;
66       /* The "home" spill slot, if needed.  Never changes. */
67       Short spill_offset;
68       Short spill_size;
69       /* What kind of register this is. */
70       HRegClass reg_class;
71    }
72    VRegLR;
73 
74 
75 /* Records information on real-register live ranges.  Computed once
76    and remains unchanged after that. */
77 typedef
78    struct {
79       HReg rreg;
80       /* Becomes live after this insn ... */
81       Short live_after;
82       /* Becomes dead before this insn ... */
83       Short dead_before;
84    }
85    RRegLR;
86 
87 
88 /* An array of the following structs (rreg_state) comprises the
89    running state of the allocator.  It indicates what the current
90    disposition of each allocatable real register is.  The array gets
91    updated as the allocator processes instructions.  The identity of
92    the register is not recorded here, because the index of this
93    structure in doRegisterAllocation()'s |rreg_state| is the index
94    number of the register, and the register itself can be extracted
95    from the RRegUniverse supplied to doRegisterAllocation(). */
96 typedef
97    struct {
98       /* ------ FIELDS WHICH DO NOT CHANGE ------ */
99       /* Is this involved in any HLRs?  (only an optimisation hint) */
100       Bool has_hlrs;
101       /* ------ FIELDS WHICH DO CHANGE ------ */
102       /* 6 May 07: rearranged fields below so the whole struct fits
103          into 16 bytes on both x86 and amd64. */
104       /* Used when .disp == Bound and we are looking for vregs to
105          spill. */
106       Bool is_spill_cand;
107       /* Optimisation: used when .disp == Bound.  Indicates when the
108          rreg has the same value as the spill slot for the associated
109          vreg.  Is safely left at False, and becomes True after a
110          spill store or reload for this rreg. */
111       Bool eq_spill_slot;
112       /* What's it's current disposition? */
113       enum { Free,     /* available for use */
114              Unavail,  /* in a real-reg live range */
115              Bound     /* in use (holding value of some vreg) */
116            }
117            disp;
118       /* If .disp == Bound, what vreg is it bound to? */
119       HReg vreg;
120    }
121    RRegState;
122 
123 
124 /* The allocator also maintains a redundant array of indexes
125    (vreg_state) from vreg numbers back to entries in rreg_state.  It
126    is redundant because iff vreg_state[i] == j then
127    hregNumber(rreg_state[j].vreg) == i -- that is, the two entries
128    point at each other.  The purpose of this is to speed up activities
129    which involve looking for a particular vreg: there is no need to
130    scan the rreg_state looking for it, just index directly into
131    vreg_state.  The FAQ "does this vreg already have an associated
132    rreg" is the main beneficiary.
133 
134    To indicate, in vreg_state[i], that a given vreg is not currently
135    associated with any rreg, that entry can be set to INVALID_RREG_NO.
136 
   Because the vreg_state entries are signed Shorts, the max number
   of vregs that can be handled by regalloc is 32767.
139 */
140 
/* Value stored in a vreg_state[] entry to say that the vreg has no
   associated rreg at the moment. */
#define INVALID_RREG_NO ((Short)(-1))

/* Range checks for vreg / rreg index numbers.  Both expand to a
   reference to a variable named n_vregs / n_rregs, which must be in
   scope wherever the macro is used. */
#define IS_VALID_VREGNO(_zz) (0 <= (_zz) && (_zz) < n_vregs)
#define IS_VALID_RREGNO(_zz) (0 <= (_zz) && (_zz) < n_rregs)
145 
146 
147 /* Search forward from some given point in the incoming instruction
148    sequence.  Point is to select a virtual register to spill, by
149    finding the vreg which is mentioned as far ahead as possible, in
150    the hope that this will minimise the number of consequent reloads.
151 
152    Only do the search for vregs which are Bound in the running state,
153    and for which the .is_spill_cand field is set.  This allows the
154    caller to arbitrarily restrict the set of spill candidates to be
155    considered.
156 
157    To do this we don't actually need to see the incoming instruction
158    stream.  Rather, what we need us the HRegUsage records for the
159    incoming instruction stream.  Hence that is passed in.
160 
161    Returns an index into the state array indicating the (v,r) pair to
162    spill, or -1 if none was found.  */
163 static
findMostDistantlyMentionedVReg(HRegUsage * reg_usages_in,Int search_from_instr,Int num_instrs,RRegState * state,Int n_state)164 Int findMostDistantlyMentionedVReg (
165    HRegUsage*   reg_usages_in,
166    Int          search_from_instr,
167    Int          num_instrs,
168    RRegState*   state,
169    Int          n_state
170 )
171 {
172    Int k, m;
173    Int furthest_k = -1;
174    Int furthest   = -1;
175    vassert(search_from_instr >= 0);
176    for (k = 0; k < n_state; k++) {
177       if (!state[k].is_spill_cand)
178          continue;
179       vassert(state[k].disp == Bound);
180       for (m = search_from_instr; m < num_instrs; m++) {
181          if (HRegUsage__contains(&reg_usages_in[m], state[k].vreg))
182             break;
183       }
184       if (m > furthest) {
185          furthest   = m;
186          furthest_k = k;
187       }
188    }
189    return furthest_k;
190 }
191 
192 
193 /* Check that this vreg has been assigned a sane spill offset. */
194 inline
sanity_check_spill_offset(VRegLR * vreg)195 static void sanity_check_spill_offset ( VRegLR* vreg )
196 {
197    switch (vreg->reg_class) {
198       case HRcVec128: case HRcFlt64:
199          vassert(0 == ((UShort)vreg->spill_offset % 16)); break;
200       default:
201          vassert(0 == ((UShort)vreg->spill_offset % 8)); break;
202    }
203 }
204 
205 
206 /* Double the size of the real-reg live-range array, if needed. */
207 __attribute__((noinline))
ensureRRLRspace_SLOW(RRegLR ** info,Int * size,Int used)208 static void ensureRRLRspace_SLOW ( RRegLR** info, Int* size, Int used )
209 {
210    Int     k;
211    RRegLR* arr2;
212    if (0)
213       vex_printf("ensureRRISpace: %d -> %d\n", *size, 2 * *size);
214    vassert(used == *size);
215    arr2 = LibVEX_Alloc_inline(2 * *size * sizeof(RRegLR));
216    for (k = 0; k < *size; k++)
217       arr2[k] = (*info)[k];
218    *size *= 2;
219    *info = arr2;
220 }
221 inline
ensureRRLRspace(RRegLR ** info,Int * size,Int used)222 static void ensureRRLRspace ( RRegLR** info, Int* size, Int used )
223 {
224    if (LIKELY(used < *size)) return;
225    ensureRRLRspace_SLOW(info, size, used);
226 }
227 
228 
229 /* Sort an array of RRegLR entries by either the .live_after or
230    .dead_before fields.  This is performance-critical. */
sortRRLRarray(RRegLR * arr,Int size,Bool by_live_after)231 static void sortRRLRarray ( RRegLR* arr,
232                             Int size, Bool by_live_after )
233 {
234    Int    incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
235                        9841, 29524, 88573, 265720,
236                        797161, 2391484 };
237    Int    lo = 0;
238    Int    hi = size-1;
239    Int    i, j, h, bigN, hp;
240    RRegLR v;
241 
242    vassert(size >= 0);
243    if (size == 0)
244       return;
245 
246    bigN = hi - lo + 1; if (bigN < 2) return;
247    hp = 0; while (hp < 14 && incs[hp] < bigN) hp++; hp--;
248 
249    if (by_live_after) {
250 
251       for ( ; hp >= 0; hp--) {
252          h = incs[hp];
253          for (i = lo + h; i <= hi; i++) {
254             v = arr[i];
255             j = i;
256             while (arr[j-h].live_after > v.live_after) {
257                arr[j] = arr[j-h];
258                j = j - h;
259                if (j <= (lo + h - 1)) break;
260             }
261             arr[j] = v;
262          }
263       }
264 
265    } else {
266 
267       for ( ; hp >= 0; hp--) {
268          h = incs[hp];
269          for (i = lo + h; i <= hi; i++) {
270             v = arr[i];
271             j = i;
272             while (arr[j-h].dead_before > v.dead_before) {
273                arr[j] = arr[j-h];
274                j = j - h;
275                if (j <= (lo + h - 1)) break;
276             }
277             arr[j] = v;
278          }
279       }
280 
281    }
282 }
283 
284 
285 /* Compute the index of the highest and lowest 1 in a ULong,
286    respectively.  Results are undefined if the argument is zero.
287    Don't pass it zero :) */
ULong__maxIndex(ULong w64)288 static inline UInt ULong__maxIndex ( ULong w64 ) {
289    return 63 - __builtin_clzll(w64);
290 }
291 
ULong__minIndex(ULong w64)292 static inline UInt ULong__minIndex ( ULong w64 ) {
293    return __builtin_ctzll(w64);
294 }
295 
296 
297 /* Vectorised memset, copied from Valgrind's m_libcbase.c. */
local_memset(void * destV,Int c,SizeT sz)298 static void* local_memset ( void *destV, Int c, SizeT sz )
299 {
300 #  define IS_4_ALIGNED(aaa_p) (0 == (((HWord)(aaa_p)) & ((HWord)0x3)))
301 
302    UInt   c4;
303    UChar* d = destV;
304    UChar  uc = c;
305 
306    while ((!IS_4_ALIGNED(d)) && sz >= 1) {
307       d[0] = uc;
308       d++;
309       sz--;
310    }
311    if (sz == 0)
312       return destV;
313    c4 = uc;
314    c4 |= (c4 << 8);
315    c4 |= (c4 << 16);
316    while (sz >= 16) {
317       ((UInt*)d)[0] = c4;
318       ((UInt*)d)[1] = c4;
319       ((UInt*)d)[2] = c4;
320       ((UInt*)d)[3] = c4;
321       d += 16;
322       sz -= 16;
323    }
324    while (sz >= 4) {
325       ((UInt*)d)[0] = c4;
326       d += 4;
327       sz -= 4;
328    }
329    while (sz >= 1) {
330       d[0] = c;
331       d++;
332       sz--;
333    }
334    return destV;
335 
336 #  undef IS_4_ALIGNED
337 }
338 
339 
340 /* A target-independent register allocator.  Requires various
341    functions which it uses to deal abstractly with instructions and
342    registers, since it cannot have any target-specific knowledge.
343 
344    Returns a new list of instructions, which, as a result of the
345    behaviour of mapRegs, will be in-place modifications of the
346    original instructions.
347 
348    Requires that the incoming code has been generated using
349    vreg numbers 0, 1 .. n_vregs-1.  Appearance of a vreg outside
350    that range is a checked run-time error.
351 
352    Takes an expandable array of pointers to unallocated insns.
353    Returns an expandable array of pointers to allocated insns.
354 */
doRegisterAllocation(HInstrArray * instrs_in,const RRegUniverse * univ,Bool (* isMove)(const HInstr *,HReg *,HReg *),void (* getRegUsage)(HRegUsage *,const HInstr *,Bool),void (* mapRegs)(HRegRemap *,HInstr *,Bool),void (* genSpill)(HInstr **,HInstr **,HReg,Int,Bool),void (* genReload)(HInstr **,HInstr **,HReg,Int,Bool),HInstr * (* directReload)(HInstr *,HReg,Short),Int guest_sizeB,void (* ppInstr)(const HInstr *,Bool),void (* ppReg)(HReg),Bool mode64)355 HInstrArray* doRegisterAllocation (
356 
357    /* Incoming virtual-registerised code. */
358    HInstrArray* instrs_in,
359 
360    /* The real-register universe to use.  This contains facts about
361       real registers, one of which is the set of registers available
362       for allocation. */
363    const RRegUniverse* univ,
364 
365    /* Return True iff the given insn is a reg-reg move, in which
366       case also return the src and dst regs. */
367    Bool (*isMove) ( const HInstr*, HReg*, HReg* ),
368 
369    /* Get info about register usage in this insn. */
370    void (*getRegUsage) ( HRegUsage*, const HInstr*, Bool ),
371 
372    /* Apply a reg-reg mapping to an insn. */
373    void (*mapRegs) ( HRegRemap*, HInstr*, Bool ),
374 
375    /* Return one, or, if we're unlucky, two insn(s) to spill/restore a
376       real reg to a spill slot byte offset.  The two leading HInstr**
377       args are out parameters, through which the generated insns are
378       returned.  Also (optionally) a 'directReload' function, which
379       attempts to replace a given instruction by one which reads
380       directly from a specified spill slot.  May be NULL, in which
381       case the optimisation is not attempted. */
382    void    (*genSpill)  ( HInstr**, HInstr**, HReg, Int, Bool ),
383    void    (*genReload) ( HInstr**, HInstr**, HReg, Int, Bool ),
384    HInstr* (*directReload) ( HInstr*, HReg, Short ),
385    Int     guest_sizeB,
386 
387    /* For debug printing only. */
388    void (*ppInstr) ( const HInstr*, Bool ),
389    void (*ppReg) ( HReg ),
390 
391    /* 32/64bit mode */
392    Bool mode64
393 )
394 {
395 #  define N_SPILL64S  (LibVEX_N_SPILL_BYTES / 8)
396 
397    const Bool eq_spill_opt = True;
398 
399    /* Info on vregs and rregs.  Computed once and remains
400       unchanged. */
401    Int     n_vregs;
402    VRegLR* vreg_lrs; /* [0 .. n_vregs-1] */
403 
   /* We keep two copies of the real-reg live range info, one sorted
      by .live_after and the other by .dead_before.  First the
      unsorted info is created in the _la variant, and then it is
      copied into the _db variant.  Once that's done both of them are
      sorted.  We also need two integer cursors which record the next
      location in the two arrays to consider. */
410    RRegLR* rreg_lrs_la;
411    RRegLR* rreg_lrs_db;
412    Int     rreg_lrs_size;
413    Int     rreg_lrs_used;
414    Int     rreg_lrs_la_next;
415    Int     rreg_lrs_db_next;
416 
417    /* Info on register usage in the incoming instruction array.
418       Computed once and remains unchanged, more or less; updated
419       sometimes by the direct-reload optimisation. */
420    HRegUsage* reg_usage_arr; /* [0 .. instrs_in->arr_used-1] */
421 
422    /* Used when constructing vreg_lrs (for allocating stack
423       slots). */
424    Short ss_busy_until_before[N_SPILL64S];
425 
426    /* Used when constructing rreg_lrs. */
427    Int* rreg_live_after;
428    Int* rreg_dead_before;
429 
430    /* Running state of the core allocation algorithm. */
431    RRegState* rreg_state;  /* [0 .. n_rregs-1] */
432    Int        n_rregs;
433 
434    /* .. and the redundant backward map */
   /* Each value is 0 .. n_rregs-1 or is INVALID_RREG_NO.
      This implies n_rregs must be <= 32768. */
437    Short*     vreg_state;  /* [0 .. n_vregs-1] */
438 
439    /* The vreg -> rreg map constructed and then applied to each
440       instr. */
441    HRegRemap remap;
442 
443    /* The output array of instructions. */
444    HInstrArray* instrs_out;
445 
446    /* Sanity checks are expensive.  They are only done periodically,
447       not at each insn processed. */
448    Bool do_sanity_check;
449 
450    vassert(0 == (guest_sizeB % LibVEX_GUEST_STATE_ALIGN));
451    vassert(0 == (LibVEX_N_SPILL_BYTES % LibVEX_GUEST_STATE_ALIGN));
452    vassert(0 == (N_SPILL64S % 2));
453 
454    /* The live range numbers are signed shorts, and so limiting the
455       number of insns to 15000 comfortably guards against them
456       overflowing 32k. */
457    vassert(instrs_in->arr_used <= 15000);
458 
459 #  define INVALID_INSTRNO (-2)
460 
461 #  define EMIT_INSTR(_instr)                  \
462       do {                                    \
463         HInstr* _tmp = (_instr);              \
464         if (DEBUG_REGALLOC) {                 \
465            vex_printf("**  ");                \
466            (*ppInstr)(_tmp, mode64);          \
467            vex_printf("\n\n");                \
468         }                                     \
469         addHInstr ( instrs_out, _tmp );       \
470       } while (0)
471 
472 #   define PRINT_STATE						   \
473       do {							   \
474          Int z, q;						   \
475          for (z = 0; z < n_rregs; z++) {			   \
476             vex_printf("  rreg_state[%2d] = ", z);		   \
477             (*ppReg)(univ->regs[z]);	       			   \
478             vex_printf("  \t");					   \
479             switch (rreg_state[z].disp) {			   \
480                case Free:    vex_printf("Free\n"); break;	   \
481                case Unavail: vex_printf("Unavail\n"); break;	   \
482                case Bound:   vex_printf("BoundTo "); 		   \
483                              (*ppReg)(rreg_state[z].vreg);	   \
484                              vex_printf("\n"); break;		   \
485             }							   \
486          }							   \
487          vex_printf("\n  vreg_state[0 .. %d]:\n    ", n_vregs-1);  \
488          q = 0;                                                    \
489          for (z = 0; z < n_vregs; z++) {                           \
490             if (vreg_state[z] == INVALID_RREG_NO)                  \
491                continue;                                           \
492             vex_printf("[%d] -> %d   ", z, vreg_state[z]);         \
493             q++;                                                   \
494             if (q > 0 && (q % 6) == 0)                             \
495                vex_printf("\n    ");                               \
496          }                                                         \
497          vex_printf("\n");                                         \
498       } while (0)
499 
500 
501    /* --------- Stage 0: set up output array --------- */
502    /* --------- and allocate/initialise running state. --------- */
503 
504    instrs_out = newHInstrArray();
505 
506    /* ... and initialise running state. */
507    /* n_rregs is no more than a short name for n_available_real_regs. */
508    n_rregs = univ->allocable;
509    n_vregs = instrs_in->n_vregs;
510 
511    /* If this is not so, vreg_state entries will overflow. */
512    vassert(n_vregs < 32767);
513 
514    /* If this is not so, the universe we have is nonsensical. */
515    vassert(n_rregs > 0);
516 
517    rreg_state = LibVEX_Alloc_inline(n_rregs * sizeof(RRegState));
518    vreg_state = LibVEX_Alloc_inline(n_vregs * sizeof(Short));
519 
520    for (Int j = 0; j < n_rregs; j++) {
521       rreg_state[j].has_hlrs      = False;
522       rreg_state[j].disp          = Free;
523       rreg_state[j].vreg          = INVALID_HREG;
524       rreg_state[j].is_spill_cand = False;
525       rreg_state[j].eq_spill_slot = False;
526    }
527 
528    for (Int j = 0; j < n_vregs; j++)
529       vreg_state[j] = INVALID_RREG_NO;
530 
531 
532    /* --------- Stage 1: compute vreg live ranges. --------- */
533    /* --------- Stage 2: compute rreg live ranges. --------- */
534 
535    /* ------ start of SET UP TO COMPUTE VREG LIVE RANGES ------ */
536 
537    /* This is relatively simple, because (1) we only seek the complete
538       end-to-end live range of each vreg, and are not interested in
539       any holes in it, and (2) the vregs are conveniently numbered 0
540       .. n_vregs-1, so we can just dump the results in a
541       pre-allocated array. */
542 
543    vreg_lrs = NULL;
544    if (n_vregs > 0)
545       vreg_lrs = LibVEX_Alloc_inline(sizeof(VRegLR) * n_vregs);
546 
547    for (Int j = 0; j < n_vregs; j++) {
548       vreg_lrs[j].live_after     = INVALID_INSTRNO;
549       vreg_lrs[j].dead_before    = INVALID_INSTRNO;
550       vreg_lrs[j].spill_offset   = 0;
551       vreg_lrs[j].spill_size     = 0;
552       vreg_lrs[j].reg_class      = HRcINVALID;
553    }
554 
555    /* An array to hold the reg-usage info for the incoming
556       instructions. */
557    reg_usage_arr
558       = LibVEX_Alloc_inline(sizeof(HRegUsage) * instrs_in->arr_used-1);
559 
560    /* ------ end of SET UP TO COMPUTE VREG LIVE RANGES ------ */
561 
562    /* ------ start of SET UP TO COMPUTE RREG LIVE RANGES ------ */
563 
564    /* This is more complex than Stage 1, because we need to compute
565       exactly all the live ranges of all the allocatable real regs,
566       and we don't know in advance how many there will be. */
567 
568    rreg_lrs_used = 0;
569    rreg_lrs_size = 4;
570    rreg_lrs_la = LibVEX_Alloc_inline(rreg_lrs_size * sizeof(RRegLR));
571    rreg_lrs_db = NULL; /* we'll create this later */
572 
   /* We'll need to track live range start/end points separately for
      each rreg.  Sigh. */
575    vassert(n_rregs > 0);
576    rreg_live_after  = LibVEX_Alloc_inline(n_rregs * sizeof(Int));
577    rreg_dead_before = LibVEX_Alloc_inline(n_rregs * sizeof(Int));
578 
579    for (Int j = 0; j < n_rregs; j++) {
580       rreg_live_after[j] =
581       rreg_dead_before[j] = INVALID_INSTRNO;
582    }
583 
584    /* ------ end of SET UP TO COMPUTE RREG LIVE RANGES ------ */
585 
586    /* ------ start of ITERATE OVER INSNS ------ */
587 
588    for (Int ii = 0; ii < instrs_in->arr_used; ii++) {
589 
590       (*getRegUsage)( &reg_usage_arr[ii], instrs_in->arr[ii], mode64 );
591 
592       if (0) {
593          vex_printf("\n%d  stage1: ", ii);
594          (*ppInstr)(instrs_in->arr[ii], mode64);
595          vex_printf("\n");
596          ppHRegUsage(univ, &reg_usage_arr[ii]);
597       }
598 
599       /* ------ start of DEAL WITH VREG LIVE RANGES ------ */
600 
601       /* for each virtual reg mentioned in the insn ... */
602       for (Int j = 0; j < reg_usage_arr[ii].n_vRegs; j++) {
603 
604          HReg vreg = reg_usage_arr[ii].vRegs[j];
605          vassert(hregIsVirtual(vreg));
606 
607          Int k = hregIndex(vreg);
608          if (k < 0 || k >= n_vregs) {
609             vex_printf("\n");
610             (*ppInstr)(instrs_in->arr[ii], mode64);
611             vex_printf("\n");
612             vex_printf("vreg %d, n_vregs %d\n", k, n_vregs);
613             vpanic("doRegisterAllocation: out-of-range vreg");
614          }
615 
616          /* Take the opportunity to note its regclass.  We'll need
617             that when allocating spill slots. */
618          if (vreg_lrs[k].reg_class == HRcINVALID) {
619             /* First mention of this vreg. */
620             vreg_lrs[k].reg_class = hregClass(vreg);
621          } else {
622             /* Seen it before, so check for consistency. */
623             vassert(vreg_lrs[k].reg_class == hregClass(vreg));
624          }
625 
626          /* Now consider live ranges. */
627          switch (reg_usage_arr[ii].vMode[j]) {
628             case HRmRead:
629                if (vreg_lrs[k].live_after == INVALID_INSTRNO) {
630                   vex_printf("\n\nOFFENDING VREG = %d\n", k);
631                   vpanic("doRegisterAllocation: "
632                          "first event for vreg is Read");
633                }
634                vreg_lrs[k].dead_before = toShort(ii + 1);
635                break;
636             case HRmWrite:
637                if (vreg_lrs[k].live_after == INVALID_INSTRNO)
638                   vreg_lrs[k].live_after = toShort(ii);
639                vreg_lrs[k].dead_before = toShort(ii + 1);
640                break;
641             case HRmModify:
642                if (vreg_lrs[k].live_after == INVALID_INSTRNO) {
643                   vex_printf("\n\nOFFENDING VREG = %d\n", k);
644                   vpanic("doRegisterAllocation: "
645                          "first event for vreg is Modify");
646                }
647                vreg_lrs[k].dead_before = toShort(ii + 1);
648                break;
649             default:
650                vpanic("doRegisterAllocation(1)");
651          } /* switch */
652 
653       } /* iterate over virtual registers */
654 
655       /* ------ end of DEAL WITH VREG LIVE RANGES ------ */
656 
657       /* ------ start of DEAL WITH RREG LIVE RANGES ------ */
658 
659       /* If this doesn't hold, the following iteration over real registers
660          will fail miserably. */
661       vassert(N_RREGUNIVERSE_REGS == 64);
662 
663       const ULong rRead      = reg_usage_arr[ii].rRead;
664       const ULong rWritten   = reg_usage_arr[ii].rWritten;
665       const ULong rMentioned = rRead | rWritten;
666 
667       UInt rReg_minIndex;
668       UInt rReg_maxIndex;
669       if (rMentioned == 0) {
670          /* There are no real register uses in this insn.  Set
671             rReg_{min,max}Index so that the following loop doesn't iterate
672             at all, so as to avoid wasting time. */
673          rReg_minIndex = 1;
674          rReg_maxIndex = 0;
675       } else {
676          rReg_minIndex = ULong__minIndex(rMentioned);
677          rReg_maxIndex = ULong__maxIndex(rMentioned);
678          /* Don't bother to look at registers which are not available
679             to the allocator.  We asserted above that n_rregs > 0, so
680             n_rregs-1 is safe. */
681          if (rReg_maxIndex >= n_rregs)
682             rReg_maxIndex = n_rregs-1;
683       }
684 
685       /* for each allocator-available real reg mentioned in the insn ... */
686       /* Note.  We are allocating only over the real regs available to
687          the allocator.  Others, eg the stack or baseblock pointers,
688          are unavailable to allocation and so we never visit them.
689          Hence the iteration is cut off at n_rregs-1, since n_rregs ==
690          univ->allocable. */
691       for (Int j = rReg_minIndex; j <= rReg_maxIndex; j++) {
692 
693          const ULong jMask = 1ULL << j;
694          if (LIKELY((rMentioned & jMask) == 0))
695             continue;
696 
697          const Bool isR = (rRead    & jMask) != 0;
698          const Bool isW = (rWritten & jMask) != 0;
699 
700          /* Dummy initialisations of flush_la and flush_db to avoid
701             possible bogus uninit-var warnings from gcc. */
702          Int  flush_la = INVALID_INSTRNO, flush_db = INVALID_INSTRNO;
703          Bool flush = False;
704 
705          if (isW && !isR) {
706             flush_la = rreg_live_after[j];
707             flush_db = rreg_dead_before[j];
708             if (flush_la != INVALID_INSTRNO && flush_db != INVALID_INSTRNO)
709                flush = True;
710             rreg_live_after[j]  = ii;
711             rreg_dead_before[j] = ii+1;
712          } else if (!isW && isR) {
713             if (rreg_live_after[j] == INVALID_INSTRNO) {
714                vex_printf("\nOFFENDING RREG = ");
715                (*ppReg)(univ->regs[j]);
716                vex_printf("\n");
717                vex_printf("\nOFFENDING instr = ");
718                (*ppInstr)(instrs_in->arr[ii], mode64);
719                vex_printf("\n");
720                vpanic("doRegisterAllocation: "
721                       "first event for rreg is Read");
722             }
723             rreg_dead_before[j] = ii+1;
724          } else {
725             vassert(isR && isW);
726             if (rreg_live_after[j] == INVALID_INSTRNO) {
727                vex_printf("\nOFFENDING RREG = ");
728                (*ppReg)(univ->regs[j]);
729                vex_printf("\n");
730                vex_printf("\nOFFENDING instr = ");
731                (*ppInstr)(instrs_in->arr[ii], mode64);
732                vex_printf("\n");
733                vpanic("doRegisterAllocation: "
734                       "first event for rreg is Modify");
735             }
736             rreg_dead_before[j] = ii+1;
737          }
738 
739          if (flush) {
740             vassert(flush_la != INVALID_INSTRNO);
741             vassert(flush_db != INVALID_INSTRNO);
742             ensureRRLRspace(&rreg_lrs_la, &rreg_lrs_size, rreg_lrs_used);
743             if (0)
744                vex_printf("FLUSH 1 (%d,%d)\n", flush_la, flush_db);
745             rreg_lrs_la[rreg_lrs_used].rreg        = univ->regs[j];
746             rreg_lrs_la[rreg_lrs_used].live_after  = toShort(flush_la);
747             rreg_lrs_la[rreg_lrs_used].dead_before = toShort(flush_db);
748             rreg_lrs_used++;
749          }
750 
751       } /* iterate over rregs in the instr */
752 
753       /* ------ end of DEAL WITH RREG LIVE RANGES ------ */
754 
755    } /* iterate over insns */
756 
757    /* ------ end of ITERATE OVER INSNS ------ */
758 
759    /* ------ start of FINALISE RREG LIVE RANGES ------ */
760 
761    /* Now finish up any live ranges left over. */
762    for (Int j = 0; j < n_rregs; j++) {
763 
764       if (0) {
765          vex_printf("residual %d:  %d %d\n", j, rreg_live_after[j],
766                                                 rreg_dead_before[j]);
767       }
768       vassert( (rreg_live_after[j] == INVALID_INSTRNO
769                 && rreg_dead_before[j] == INVALID_INSTRNO)
770               ||
771                (rreg_live_after[j] != INVALID_INSTRNO
772                 && rreg_dead_before[j] != INVALID_INSTRNO)
773             );
774 
775       if (rreg_live_after[j] == INVALID_INSTRNO)
776          continue;
777 
778       ensureRRLRspace(&rreg_lrs_la, &rreg_lrs_size, rreg_lrs_used);
779       if (0)
780          vex_printf("FLUSH 2 (%d,%d)\n",
781                     rreg_live_after[j], rreg_dead_before[j]);
782       rreg_lrs_la[rreg_lrs_used].rreg        = univ->regs[j];
783       rreg_lrs_la[rreg_lrs_used].live_after  = toShort(rreg_live_after[j]);
784       rreg_lrs_la[rreg_lrs_used].dead_before = toShort(rreg_dead_before[j]);
785       rreg_lrs_used++;
786    }
787 
788    /* Compute summary hints for choosing real regs.  If a real reg is
789       involved in a hard live range, record that fact in the fixed
790       part of the running rreg_state.  Later, when offered a choice between
791       rregs, it's better to choose one which is not marked as having
792       any HLRs, since ones with HLRs may need to be spilled around
793       their HLRs.  Correctness of final assignment is unaffected by
794       this mechanism -- it is only an optimisation. */
795 
796    for (Int j = 0; j < rreg_lrs_used; j++) {
797       HReg rreg = rreg_lrs_la[j].rreg;
798       vassert(!hregIsVirtual(rreg));
799       /* rreg is involved in a HLR.  Record this info in the array, if
800          there is space. */
801       UInt ix = hregIndex(rreg);
802       vassert(ix < n_rregs);
803       rreg_state[ix].has_hlrs = True;
804    }
805    if (0) {
806       for (Int j = 0; j < n_rregs; j++) {
807          if (!rreg_state[j].has_hlrs)
808             continue;
809          ppReg(univ->regs[j]);
810          vex_printf(" hinted\n");
811       }
812    }
813 
814    /* Finally, copy the _la variant into the _db variant and
815       sort both by their respective fields. */
816    rreg_lrs_db = LibVEX_Alloc_inline(rreg_lrs_used * sizeof(RRegLR));
817    for (Int j = 0; j < rreg_lrs_used; j++)
818       rreg_lrs_db[j] = rreg_lrs_la[j];
819 
820    sortRRLRarray( rreg_lrs_la, rreg_lrs_used, True /* by .live_after*/  );
821    sortRRLRarray( rreg_lrs_db, rreg_lrs_used, False/* by .dead_before*/ );
822 
823    /* And set up the cursors. */
824    rreg_lrs_la_next = 0;
825    rreg_lrs_db_next = 0;
826 
827    for (Int j = 1; j < rreg_lrs_used; j++) {
828       vassert(rreg_lrs_la[j-1].live_after  <= rreg_lrs_la[j].live_after);
829       vassert(rreg_lrs_db[j-1].dead_before <= rreg_lrs_db[j].dead_before);
830    }
831 
832    /* ------ end of FINALISE RREG LIVE RANGES ------ */
833 
834    if (DEBUG_REGALLOC) {
835       for (Int j = 0; j < n_vregs; j++) {
836          vex_printf("vreg %d:  la = %d,  db = %d\n",
837                     j, vreg_lrs[j].live_after, vreg_lrs[j].dead_before );
838       }
839    }
840 
841    if (DEBUG_REGALLOC) {
842       vex_printf("RRegLRs by LA:\n");
843       for (Int j = 0; j < rreg_lrs_used; j++) {
844          vex_printf("  ");
845          (*ppReg)(rreg_lrs_la[j].rreg);
846          vex_printf("      la = %d,  db = %d\n",
847                     rreg_lrs_la[j].live_after, rreg_lrs_la[j].dead_before );
848       }
849       vex_printf("RRegLRs by DB:\n");
850       for (Int j = 0; j < rreg_lrs_used; j++) {
851          vex_printf("  ");
852          (*ppReg)(rreg_lrs_db[j].rreg);
853          vex_printf("      la = %d,  db = %d\n",
854                     rreg_lrs_db[j].live_after, rreg_lrs_db[j].dead_before );
855       }
856    }
857 
858    /* --------- Stage 3: allocate spill slots. --------- */
859 
860    /* Each spill slot is 8 bytes long.  For vregs which take more than
861       64 bits to spill (classes Flt64 and Vec128), we have to allocate
862       two consecutive spill slots.  For 256 bit registers (class
863       Vec256), we have to allocate four consecutive spill slots.
864 
865       For Vec128-class on PowerPC, the spill slot's actual address
866       must be 16-byte aligned.  Since the spill slot's address is
867       computed as an offset from the guest state pointer, and since
868       the user of the generated code must set that pointer to a
869       32-aligned value, we have the residual obligation here of
870       choosing a 16-aligned spill slot offset for Vec128-class values.
871       Since each spill slot is 8 bytes long, that means for
      Vec128-class values we must allocate a spill slot number which
873       is zero mod 2.
874 
875       Similarly, for Vec256 class on amd64, find a spill slot number
876       which is zero mod 4.  This guarantees it will be 32 byte
877       aligned, which isn't actually necessary on amd64 (we use movUpd
878       etc to spill), but seems like good practice.
879 
880       Do a rank-based allocation of vregs to spill slot numbers.  We
881       put as few values as possible in spill slots, but nevertheless
882       need to have a spill slot available for all vregs, just in case.
883    */
884    /* Int max_ss_no = -1; */
885 
886    local_memset(ss_busy_until_before, 0, sizeof(ss_busy_until_before));
887 
888    for (Int j = 0; j < n_vregs; j++) {
889 
890       /* True iff this vreg is unused.  In which case we also expect
891          that the reg_class field for it has not been set.  */
892       if (vreg_lrs[j].live_after == INVALID_INSTRNO) {
893          vassert(vreg_lrs[j].reg_class == HRcINVALID);
894          continue;
895       }
896 
897       /* The spill slots are 64 bits in size.  As per the comment on
898          definition of HRegClass in host_generic_regs.h, that means,
899          to spill a vreg of class Flt64 or Vec128, we'll need to find
900          two adjacent spill slots to use.  For Vec256, we'll need to
901          find four adjacent slots to use.  Note, this logic needs to
         be kept in sync with the size info on the definition of
903          HRegClass. */
904       Int ss_no = -1;
905       switch (vreg_lrs[j].reg_class) {
906 
907          case HRcVec128: case HRcFlt64:
            /* Find two adjacent free slots which between them
909                provide up to 128 bits in which to spill the vreg.
910                Since we are trying to find an even:odd pair, move
911                along in steps of 2 (slots). */
912             for (ss_no = 0; ss_no < N_SPILL64S-1; ss_no += 2)
913                if (ss_busy_until_before[ss_no+0] <= vreg_lrs[j].live_after
914                    && ss_busy_until_before[ss_no+1] <= vreg_lrs[j].live_after)
915                   break;
916             if (ss_no >= N_SPILL64S-1) {
917                vpanic("LibVEX_N_SPILL_BYTES is too low.  "
918                       "Increase and recompile.");
919             }
920             ss_busy_until_before[ss_no+0] = vreg_lrs[j].dead_before;
921             ss_busy_until_before[ss_no+1] = vreg_lrs[j].dead_before;
922             break;
923 
924          default:
925             /* The ordinary case -- just find a single spill slot. */
926             /* Find the lowest-numbered spill slot which is available
927                at the start point of this interval, and assign the
928                interval to it. */
929             for (ss_no = 0; ss_no < N_SPILL64S; ss_no++)
930                if (ss_busy_until_before[ss_no] <= vreg_lrs[j].live_after)
931                   break;
932             if (ss_no == N_SPILL64S) {
933                vpanic("LibVEX_N_SPILL_BYTES is too low.  "
934                       "Increase and recompile.");
935             }
936             ss_busy_until_before[ss_no] = vreg_lrs[j].dead_before;
937             break;
938 
939       } /* switch (vreg_lrs[j].reg_class) */
940 
941       /* This reflects LibVEX's hard-wired knowledge of the baseBlock
942          layout: the guest state, then two equal sized areas following
943          it for two sets of shadow state, and then the spill area. */
944       vreg_lrs[j].spill_offset = toShort(guest_sizeB * 3 + ss_no * 8);
945 
946       /* Independent check that we've made a sane choice of slot */
947       sanity_check_spill_offset( &vreg_lrs[j] );
948       /* if (j > max_ss_no) */
949       /*    max_ss_no = j; */
950    }
951 
952    if (0) {
953       vex_printf("\n\n");
954       for (Int j = 0; j < n_vregs; j++)
955          vex_printf("vreg %d    --> spill offset %d\n",
956                     j, vreg_lrs[j].spill_offset);
957    }
958 
959    /* --------- Stage 4: establish rreg preferences --------- */
960 
   /* It may be advantageous to allocate certain vregs to specific
962       rregs, as a way of avoiding reg-reg moves later.  Here we
963       establish which, if any, rreg each vreg would prefer to be in.
964       Note that this constrains the allocator -- ideally we end up
965       with as few as possible vregs expressing a preference.
966 
967       This is an optimisation: if the .preferred_rreg field is never
968       set to anything different from INVALID_HREG, the allocator still
969       works. */
970 
971    /* 30 Dec 04: removed this mechanism as it does not seem to
972       help. */
973 
974    /* --------- Stage 5: process instructions --------- */
975 
976    /* This is the main loop of the allocator.  First, we need to
977       correctly set up our running state, which tracks the status of
978       each real register. */
979 
980    /* ------ BEGIN: Process each insn in turn. ------ */
981 
982    for (Int ii = 0; ii < instrs_in->arr_used; ii++) {
983 
984       if (DEBUG_REGALLOC) {
985          vex_printf("\n====----====---- Insn %d ----====----====\n", ii);
986          vex_printf("---- ");
987          (*ppInstr)(instrs_in->arr[ii], mode64);
988          vex_printf("\n\nInitial state:\n");
989          PRINT_STATE;
990          vex_printf("\n");
991       }
992 
993       /* ------------ Sanity checks ------------ */
994 
995       /* Sanity checks are expensive.  So they are done only once
996          every 13 instructions, and just before the last
997          instruction. */
998       do_sanity_check
999          = toBool(
1000               False /* Set to True for sanity checking of all insns. */
1001               || ii == instrs_in->arr_used-1
1002               || (ii > 0 && (ii % 13) == 0)
1003            );
1004 
1005       if (do_sanity_check) {
1006 
1007          /* Sanity check 1: all rregs with a hard live range crossing
1008             this insn must be marked as unavailable in the running
1009             state. */
1010          for (Int j = 0; j < rreg_lrs_used; j++) {
1011             if (rreg_lrs_la[j].live_after < ii
1012                 && ii < rreg_lrs_la[j].dead_before) {
1013                /* ii is the middle of a hard live range for some real
1014                   reg.  Check it's marked as such in the running
1015                   state. */
1016                HReg reg = rreg_lrs_la[j].rreg;
1017 
1018                if (0) {
1019                   vex_printf("considering la %d .. db %d   reg = ",
1020                              rreg_lrs_la[j].live_after,
1021                              rreg_lrs_la[j].dead_before);
1022                   (*ppReg)(reg);
1023                   vex_printf("\n");
1024                }
1025 
1026                /* assert that this rreg is marked as unavailable */
1027                vassert(!hregIsVirtual(reg));
1028                vassert(rreg_state[hregIndex(reg)].disp == Unavail);
1029             }
1030          }
1031 
1032          /* Sanity check 2: conversely, all rregs marked as
1033             unavailable in the running rreg_state must have a
1034             corresponding hard live range entry in the rreg_lrs
1035             array. */
1036          for (Int j = 0; j < n_rregs; j++) {
1037             vassert(rreg_state[j].disp == Bound
1038                     || rreg_state[j].disp == Free
1039                     || rreg_state[j].disp == Unavail);
1040             if (rreg_state[j].disp != Unavail)
1041                continue;
1042             Int k;
1043             for (k = 0; k < rreg_lrs_used; k++) {
1044                HReg reg = rreg_lrs_la[k].rreg;
1045                vassert(!hregIsVirtual(reg));
1046                if (hregIndex(reg) == j
1047                    && rreg_lrs_la[k].live_after < ii
1048                    && ii < rreg_lrs_la[k].dead_before)
1049                   break;
1050             }
1051             /* If this vassertion fails, we couldn't find a
1052                corresponding HLR. */
1053             vassert(k < rreg_lrs_used);
1054          }
1055 
1056          /* Sanity check 3: all vreg-rreg bindings must bind registers
1057             of the same class. */
1058          for (Int j = 0; j < n_rregs; j++) {
1059             if (rreg_state[j].disp != Bound) {
1060                vassert(rreg_state[j].eq_spill_slot == False);
1061                continue;
1062             }
1063             vassert(hregClass(univ->regs[j])
1064                     == hregClass(rreg_state[j].vreg));
1065             vassert( hregIsVirtual(rreg_state[j].vreg));
1066          }
1067 
1068          /* Sanity check 4: the vreg_state and rreg_state
1069             mutually-redundant mappings are consistent.  If
1070             rreg_state[j].vreg points at some vreg_state entry then
1071             that vreg_state entry should point back at
1072             rreg_state[j]. */
1073          for (Int j = 0; j < n_rregs; j++) {
1074             if (rreg_state[j].disp != Bound)
1075                continue;
1076             Int k = hregIndex(rreg_state[j].vreg);
1077             vassert(IS_VALID_VREGNO(k));
1078             vassert(vreg_state[k] == j);
1079          }
1080          for (Int j = 0; j < n_vregs; j++) {
1081             Int k = vreg_state[j];
1082             if (k == INVALID_RREG_NO)
1083                continue;
1084             vassert(IS_VALID_RREGNO(k));
1085             vassert(rreg_state[k].disp == Bound);
1086             vassert(hregIndex(rreg_state[k].vreg) == j);
1087          }
1088 
1089       } /* if (do_sanity_check) */
1090 
1091       /* ------------ end of Sanity checks ------------ */
1092 
1093       /* Do various optimisations pertaining to register coalescing
1094          and preferencing:
1095             MOV  v <-> v   coalescing (done here).
1096             MOV  v <-> r   coalescing (not yet, if ever)
1097       */
1098       /* If doing a reg-reg move between two vregs, and the src's live
1099          range ends here and the dst's live range starts here, bind
1100          the dst to the src's rreg, and that's all. */
1101       HReg vregS = INVALID_HREG;
1102       HReg vregD = INVALID_HREG;
1103       if ( (*isMove)( instrs_in->arr[ii], &vregS, &vregD ) ) {
1104          if (!hregIsVirtual(vregS)) goto cannot_coalesce;
1105          if (!hregIsVirtual(vregD)) goto cannot_coalesce;
1106          /* Check that *isMove is not telling us a bunch of lies ... */
1107          vassert(hregClass(vregS) == hregClass(vregD));
1108          Int k = hregIndex(vregS);
1109          Int m = hregIndex(vregD);
1110          vassert(IS_VALID_VREGNO(k));
1111          vassert(IS_VALID_VREGNO(m));
1112          if (vreg_lrs[k].dead_before != ii + 1) goto cannot_coalesce;
1113          if (vreg_lrs[m].live_after != ii) goto cannot_coalesce;
1114          if (DEBUG_REGALLOC) {
1115          vex_printf("COALESCE ");
1116             (*ppReg)(vregS);
1117             vex_printf(" -> ");
1118             (*ppReg)(vregD);
1119             vex_printf("\n\n");
1120          }
1121          /* Find the state entry for vregS. */
1122          Int n = vreg_state[k]; /* k is the index of vregS */
1123          if (n == INVALID_RREG_NO) {
1124             /* vregS is not currently in a real register.  So we can't
1125                do the coalescing.  Give up. */
1126             goto cannot_coalesce;
1127          }
1128          vassert(IS_VALID_RREGNO(n));
1129 
1130          /* Finally, we can do the coalescing.  It's trivial -- merely
1131             claim vregS's register for vregD. */
1132          rreg_state[n].vreg = vregD;
1133          vassert(IS_VALID_VREGNO(hregIndex(vregD)));
1134          vassert(IS_VALID_VREGNO(hregIndex(vregS)));
1135          vreg_state[hregIndex(vregD)] = toShort(n);
1136          vreg_state[hregIndex(vregS)] = INVALID_RREG_NO;
1137 
1138          /* This rreg has become associated with a different vreg and
1139             hence with a different spill slot.  Play safe. */
1140          rreg_state[n].eq_spill_slot = False;
1141 
1142          /* Move on to the next insn.  We skip the post-insn stuff for
1143             fixed registers, since this move should not interact with
1144             them in any way. */
1145          continue;
1146       }
1147      cannot_coalesce:
1148 
1149       /* ------ Free up rregs bound to dead vregs ------ */
1150 
1151       /* Look for vregs whose live range has just ended, and
1152 	 mark the associated rreg as free. */
1153 
1154       for (Int j = 0; j < n_rregs; j++) {
1155          if (rreg_state[j].disp != Bound)
1156             continue;
1157          UInt vregno = hregIndex(rreg_state[j].vreg);
1158          vassert(IS_VALID_VREGNO(vregno));
1159          if (vreg_lrs[vregno].dead_before <= ii) {
1160             rreg_state[j].disp = Free;
1161             rreg_state[j].eq_spill_slot = False;
1162             Int m = hregIndex(rreg_state[j].vreg);
1163             vassert(IS_VALID_VREGNO(m));
1164             vreg_state[m] = INVALID_RREG_NO;
1165             if (DEBUG_REGALLOC) {
1166                vex_printf("free up ");
1167                (*ppReg)(univ->regs[j]);
1168                vex_printf("\n");
1169             }
1170          }
1171       }
1172 
1173       /* ------ Pre-instruction actions for fixed rreg uses ------ */
1174 
1175       /* Now we have to deal with rregs which are about to be made
1176          live by this instruction -- in other words, are entering into
1177          one of their live ranges.  If any such rreg holds a vreg, we
1178          will have to free up the rreg.  The simplest solution which
1179          is correct is to spill the rreg.
1180 
1181          Note we could do better:
1182          * Could move it into some other free rreg, if one is available
1183 
1184          Do this efficiently, by incrementally stepping along an array
1185          of rreg HLRs that are known to be sorted by start point
1186          (their .live_after field).
1187       */
1188       while (True) {
1189          vassert(rreg_lrs_la_next >= 0);
1190          vassert(rreg_lrs_la_next <= rreg_lrs_used);
1191          if (rreg_lrs_la_next == rreg_lrs_used)
1192             break; /* no more real reg live ranges to consider */
1193          if (ii < rreg_lrs_la[rreg_lrs_la_next].live_after)
1194             break; /* next live range does not yet start */
1195          vassert(ii == rreg_lrs_la[rreg_lrs_la_next].live_after);
1196          /* rreg_lrs_la[rreg_lrs_la_next].rreg needs to be freed up.
1197             Find the associated rreg_state entry. */
1198          /* Note, re ii == rreg_lrs_la[rreg_lrs_la_next].live_after.
1199             Real register live ranges are guaranteed to be well-formed
1200             in that they start with a write to the register -- Stage 2
1201             rejects any code not satisfying this.  So the correct
1202             question to ask is whether
1203             rreg_lrs_la[rreg_lrs_la_next].live_after == ii, that is,
1204             whether the reg becomes live after this insn -- rather
1205             than before it. */
1206          if (DEBUG_REGALLOC) {
1207             vex_printf("need to free up rreg: ");
1208             (*ppReg)(rreg_lrs_la[rreg_lrs_la_next].rreg);
1209             vex_printf("\n\n");
1210          }
1211          Int k = hregIndex(rreg_lrs_la[rreg_lrs_la_next].rreg);
1212 
1213          /* If this fails, we don't have an entry for this rreg.
1214             Which we should. */
1215          vassert(IS_VALID_RREGNO(k));
1216          Int m = hregIndex(rreg_state[k].vreg);
1217          if (rreg_state[k].disp == Bound) {
1218             /* Yes, there is an associated vreg.  Spill it if it's
1219                still live. */
1220             vassert(IS_VALID_VREGNO(m));
1221             vreg_state[m] = INVALID_RREG_NO;
1222             if (vreg_lrs[m].dead_before > ii) {
1223                vassert(vreg_lrs[m].reg_class != HRcINVALID);
1224                if ((!eq_spill_opt) || !rreg_state[k].eq_spill_slot) {
1225                   HInstr* spill1 = NULL;
1226                   HInstr* spill2 = NULL;
1227                   (*genSpill)( &spill1, &spill2, univ->regs[k],
1228                                vreg_lrs[m].spill_offset, mode64 );
1229                   vassert(spill1 || spill2); /* can't both be NULL */
1230                   if (spill1)
1231                      EMIT_INSTR(spill1);
1232                   if (spill2)
1233                      EMIT_INSTR(spill2);
1234                }
1235                rreg_state[k].eq_spill_slot = True;
1236             }
1237          }
1238          rreg_state[k].disp = Unavail;
1239          rreg_state[k].vreg = INVALID_HREG;
1240          rreg_state[k].eq_spill_slot = False;
1241 
1242          /* check for further rregs entering HLRs at this point */
1243          rreg_lrs_la_next++;
1244       }
1245 
1246       if (DEBUG_REGALLOC) {
1247          vex_printf("After pre-insn actions for fixed regs:\n");
1248          PRINT_STATE;
1249          vex_printf("\n");
1250       }
1251 
1252       /* ------ Deal with the current instruction. ------ */
1253 
1254       /* Finally we can begin the processing of this instruction
1255          itself.  The aim is to free up enough rregs for this insn.
1256          This may generate spill stores since we may have to evict
1257          some vregs currently in rregs.  Also generates spill loads.
1258          We also build up the final vreg->rreg mapping to be applied
1259          to the insn. */
1260 
1261       initHRegRemap(&remap);
1262 
1263       /* ------------ BEGIN directReload optimisation ----------- */
1264 
1265       /* If the instruction reads exactly one vreg which is currently
         in a spill slot, and this is the last use of that vreg, see if we
1267          can convert the instruction into one that reads directly from
1268          the spill slot.  This is clearly only possible for x86 and
1269          amd64 targets, since ppc and arm are load-store
1270          architectures.  If successful, replace instrs_in->arr[ii]
1271          with this new instruction, and recompute its reg usage, so
1272          that the change is invisible to the standard-case handling
1273          that follows. */
1274 
1275       if (directReload && reg_usage_arr[ii].n_vRegs <= 2) {
1276          Bool  debug_direct_reload = False;
1277          HReg  cand     = INVALID_HREG;
1278          Bool  nreads   = 0;
1279          Short spilloff = 0;
1280 
1281          for (Int j = 0; j < reg_usage_arr[ii].n_vRegs; j++) {
1282 
1283             HReg vreg = reg_usage_arr[ii].vRegs[j];
1284             vassert(hregIsVirtual(vreg));
1285 
1286             if (reg_usage_arr[ii].vMode[j] == HRmRead) {
1287                nreads++;
1288                Int m = hregIndex(vreg);
1289                vassert(IS_VALID_VREGNO(m));
1290                Int k = vreg_state[m];
1291                if (!IS_VALID_RREGNO(k)) {
1292                   /* ok, it is spilled.  Now, is this its last use? */
1293                   vassert(vreg_lrs[m].dead_before >= ii+1);
1294                   if (vreg_lrs[m].dead_before == ii+1
1295                       && hregIsInvalid(cand)) {
1296                      spilloff = vreg_lrs[m].spill_offset;
1297                      cand = vreg;
1298                   }
1299                }
1300             }
1301          }
1302 
1303          if (nreads == 1 && ! hregIsInvalid(cand)) {
1304             HInstr* reloaded;
1305             if (reg_usage_arr[ii].n_vRegs == 2)
1306                vassert(! sameHReg(reg_usage_arr[ii].vRegs[0],
1307                                   reg_usage_arr[ii].vRegs[1]));
1308 
1309             reloaded = directReload ( instrs_in->arr[ii], cand, spilloff );
1310             if (debug_direct_reload && !reloaded) {
1311                vex_printf("[%3d] ", spilloff); ppHReg(cand); vex_printf(" ");
1312                ppInstr(instrs_in->arr[ii], mode64);
1313             }
1314             if (reloaded) {
1315                /* Update info about the insn, so it looks as if it had
1316                   been in this form all along. */
1317                instrs_in->arr[ii] = reloaded;
1318                (*getRegUsage)( &reg_usage_arr[ii], instrs_in->arr[ii], mode64 );
1319                if (debug_direct_reload && !reloaded) {
1320                   vex_printf("  -->  ");
1321                   ppInstr(reloaded, mode64);
1322                }
1323             }
1324 
1325             if (debug_direct_reload && !reloaded)
1326                vex_printf("\n");
1327          }
1328 
1329       }
1330 
1331       /* ------------ END directReload optimisation ------------ */
1332 
1333       /* for each virtual reg mentioned in the insn ... */
1334       for (Int j = 0; j < reg_usage_arr[ii].n_vRegs; j++) {
1335 
1336          HReg vreg = reg_usage_arr[ii].vRegs[j];
1337          vassert(hregIsVirtual(vreg));
1338 
1339          if (0) {
1340             vex_printf("considering "); (*ppReg)(vreg); vex_printf("\n");
1341          }
1342 
1343          /* Now we're trying to find a rreg for "vreg".  First of all,
1344             if it already has an rreg assigned, we don't need to do
1345             anything more.  Inspect the current state to find out. */
1346          Int m = hregIndex(vreg);
1347          vassert(IS_VALID_VREGNO(m));
1348          Int n = vreg_state[m];
1349          if (IS_VALID_RREGNO(n)) {
1350             vassert(rreg_state[n].disp == Bound);
1351             addToHRegRemap(&remap, vreg, univ->regs[n]);
1352             /* If this rreg is written or modified, mark it as different
1353                from any spill slot value. */
1354             if (reg_usage_arr[ii].vMode[j] != HRmRead)
1355                rreg_state[n].eq_spill_slot = False;
1356             continue;
1357          } else {
1358             vassert(n == INVALID_RREG_NO);
1359          }
1360 
1361          /* No luck.  The next thing to do is see if there is a
1362             currently free rreg available, of the correct class.  If
1363             so, bag it.  NOTE, we could improve this by selecting an
1364             rreg for which the next live-range event is as far ahead
1365             as possible. */
1366          Int k_suboptimal = -1;
1367          Int k;
1368          for (k = 0; k < n_rregs; k++) {
1369             if (rreg_state[k].disp != Free
1370                 || hregClass(univ->regs[k]) != hregClass(vreg))
1371                continue;
1372             if (rreg_state[k].has_hlrs) {
1373                /* Well, at least we can use k_suboptimal if we really
1374                   have to.  Keep on looking for a better candidate. */
1375                k_suboptimal = k;
1376             } else {
1377                /* Found a preferable reg.  Use it. */
1378                k_suboptimal = -1;
1379                break;
1380             }
1381          }
1382          if (k_suboptimal >= 0)
1383             k = k_suboptimal;
1384 
1385          if (k < n_rregs) {
1386             rreg_state[k].disp = Bound;
1387             rreg_state[k].vreg = vreg;
1388             Int p = hregIndex(vreg);
1389             vassert(IS_VALID_VREGNO(p));
1390             vreg_state[p] = toShort(k);
1391             addToHRegRemap(&remap, vreg, univ->regs[k]);
1392             /* Generate a reload if needed.  This only creates needed
1393                reloads because the live range builder for vregs will
1394                guarantee that the first event for a vreg is a write.
1395                Hence, if this reference is not a write, it cannot be
1396                the first reference for this vreg, and so a reload is
1397                indeed needed. */
1398             if (reg_usage_arr[ii].vMode[j] != HRmWrite) {
1399                vassert(vreg_lrs[p].reg_class != HRcINVALID);
1400                HInstr* reload1 = NULL;
1401                HInstr* reload2 = NULL;
1402                (*genReload)( &reload1, &reload2, univ->regs[k],
1403                              vreg_lrs[p].spill_offset, mode64 );
1404                vassert(reload1 || reload2); /* can't both be NULL */
1405                if (reload1)
1406                   EMIT_INSTR(reload1);
1407                if (reload2)
1408                   EMIT_INSTR(reload2);
1409                /* This rreg is read or modified by the instruction.
1410                   If it's merely read we can claim it now equals the
1411                   spill slot, but not so if it is modified. */
1412                if (reg_usage_arr[ii].vMode[j] == HRmRead) {
1413                   rreg_state[k].eq_spill_slot = True;
1414                } else {
1415                   vassert(reg_usage_arr[ii].vMode[j] == HRmModify);
1416                   rreg_state[k].eq_spill_slot = False;
1417                }
1418             } else {
1419                rreg_state[k].eq_spill_slot = False;
1420             }
1421 
1422             continue;
1423          }
1424 
1425          /* Well, now we have no option but to spill a vreg.  It's
1426             important to make a good choice of vreg to spill, and of
1427             course we need to be careful not to spill a vreg which is
1428             needed by this insn. */
1429 
1430          /* First, mark in the rreg_state, those rregs which are not spill
1431             candidates, due to holding a vreg mentioned by this
1432             instruction.  Or being of the wrong class. */
1433          for (k = 0; k < n_rregs; k++) {
1434             rreg_state[k].is_spill_cand = False;
1435             if (rreg_state[k].disp != Bound)
1436                continue;
1437             if (hregClass(univ->regs[k]) != hregClass(vreg))
1438                continue;
1439             rreg_state[k].is_spill_cand = True;
1440             /* Note, the following loop visits only the virtual regs
1441                mentioned by the instruction. */
1442             for (m = 0; m < reg_usage_arr[ii].n_vRegs; m++) {
1443                if (sameHReg(rreg_state[k].vreg, reg_usage_arr[ii].vRegs[m])) {
1444                   rreg_state[k].is_spill_cand = False;
1445                   break;
1446                }
1447             }
1448          }
1449 
1450          /* We can choose to spill any rreg satisfying
1451             rreg_state[r].is_spill_cand (so to speak).  Choose r so that
1452             the next use of its associated vreg is as far ahead as
1453             possible, in the hope that this will minimise the number
1454             of consequent reloads required. */
1455          Int spillee
1456             = findMostDistantlyMentionedVReg (
1457                  reg_usage_arr, ii+1, instrs_in->arr_used, rreg_state, n_rregs );
1458 
1459          if (spillee == -1) {
1460             /* Hmmmmm.  There don't appear to be any spill candidates.
1461                We're hosed. */
1462             vex_printf("reg_alloc: can't find a register in class: ");
1463             ppHRegClass(hregClass(vreg));
1464             vex_printf("\n");
1465             vpanic("reg_alloc: can't create a free register.");
1466          }
1467 
1468          /* Right.  So we're going to spill rreg_state[spillee]. */
1469          vassert(IS_VALID_RREGNO(spillee));
1470          vassert(rreg_state[spillee].disp == Bound);
1471          /* check it's the right class */
1472          vassert(hregClass(univ->regs[spillee]) == hregClass(vreg));
1473          /* check we're not ejecting the vreg for which we are trying
1474             to free up a register. */
1475          vassert(! sameHReg(rreg_state[spillee].vreg, vreg));
1476 
1477          m = hregIndex(rreg_state[spillee].vreg);
1478          vassert(IS_VALID_VREGNO(m));
1479 
1480          /* So here's the spill store.  Assert that we're spilling a
1481             live vreg. */
1482          vassert(vreg_lrs[m].dead_before > ii);
1483          vassert(vreg_lrs[m].reg_class != HRcINVALID);
1484          if ((!eq_spill_opt) || !rreg_state[spillee].eq_spill_slot) {
1485             HInstr* spill1 = NULL;
1486             HInstr* spill2 = NULL;
1487             (*genSpill)( &spill1, &spill2, univ->regs[spillee],
1488                          vreg_lrs[m].spill_offset, mode64 );
1489             vassert(spill1 || spill2); /* can't both be NULL */
1490             if (spill1)
1491                EMIT_INSTR(spill1);
1492             if (spill2)
1493                EMIT_INSTR(spill2);
1494          }
1495 
1496          /* Update the rreg_state to reflect the new assignment for this
1497             rreg. */
1498          rreg_state[spillee].vreg = vreg;
1499          vreg_state[m] = INVALID_RREG_NO;
1500 
1501          rreg_state[spillee].eq_spill_slot = False; /* be safe */
1502 
1503          m = hregIndex(vreg);
1504          vassert(IS_VALID_VREGNO(m));
1505          vreg_state[m] = toShort(spillee);
1506 
1507          /* Now, if this vreg is being read or modified (as opposed to
1508             written), we have to generate a reload for it. */
1509          if (reg_usage_arr[ii].vMode[j] != HRmWrite) {
1510             vassert(vreg_lrs[m].reg_class != HRcINVALID);
1511             HInstr* reload1 = NULL;
1512             HInstr* reload2 = NULL;
1513             (*genReload)( &reload1, &reload2, univ->regs[spillee],
1514                           vreg_lrs[m].spill_offset, mode64 );
1515             vassert(reload1 || reload2); /* can't both be NULL */
1516             if (reload1)
1517                EMIT_INSTR(reload1);
1518             if (reload2)
1519                EMIT_INSTR(reload2);
1520             /* This rreg is read or modified by the instruction.
1521                If it's merely read we can claim it now equals the
1522                spill slot, but not so if it is modified. */
1523             if (reg_usage_arr[ii].vMode[j] == HRmRead) {
1524                rreg_state[spillee].eq_spill_slot = True;
1525             } else {
1526                vassert(reg_usage_arr[ii].vMode[j] == HRmModify);
1527                rreg_state[spillee].eq_spill_slot = False;
1528             }
1529          }
1530 
1531          /* So after much twisting and turning, we have vreg mapped to
1532             univ->regs[spillee].  Note that in the map. */
1533          addToHRegRemap(&remap, vreg, univ->regs[spillee]);
1534 
1535       } /* iterate over virtual registers in this instruction. */
1536 
1537       /* We've finished clowning around with registers in this instruction.
1538          Three results:
1539          - the running rreg_state[] has been updated
1540          - a suitable vreg->rreg mapping for this instruction has been
1541            constructed
1542          - spill and reload instructions may have been emitted.
1543 
1544         The final step is to apply the mapping to the instruction,
1545         and emit that.
1546       */
1547 
1548       /* NOTE, DESTRUCTIVELY MODIFIES instrs_in->arr[ii]. */
1549       (*mapRegs)( &remap, instrs_in->arr[ii], mode64 );
1550       EMIT_INSTR( instrs_in->arr[ii] );
1551 
1552       if (DEBUG_REGALLOC) {
1553          vex_printf("After dealing with current insn:\n");
1554          PRINT_STATE;
1555          vex_printf("\n");
1556       }
1557 
1558       /* ------ Post-instruction actions for fixed rreg uses ------ */
1559 
1560       /* Now we need to check for rregs exiting fixed live ranges
1561          after this instruction, and if so mark them as free. */
1562       while (True) {
1563          vassert(rreg_lrs_db_next >= 0);
1564          vassert(rreg_lrs_db_next <= rreg_lrs_used);
1565          if (rreg_lrs_db_next == rreg_lrs_used)
1566             break; /* no more real reg live ranges to consider */
1567          if (ii+1 < rreg_lrs_db[rreg_lrs_db_next].dead_before)
1568             break; /* next live range does not yet end */
1569          vassert(ii+1 == rreg_lrs_db[rreg_lrs_db_next].dead_before);
1570          /* rreg_lrs_db[rreg_lrs_db_next].rreg is exiting a hard live
1571             range.  Mark it as such in the main rreg_state array. */
1572          HReg reg = rreg_lrs_db[rreg_lrs_db_next].rreg;
1573          vassert(!hregIsVirtual(reg));
1574          Int k = hregIndex(reg);
1575          vassert(IS_VALID_RREGNO(k));
1576          vassert(rreg_state[k].disp == Unavail);
1577          rreg_state[k].disp = Free;
1578          rreg_state[k].vreg = INVALID_HREG;
1579          rreg_state[k].eq_spill_slot = False;
1580 
1581          /* check for further rregs leaving HLRs at this point */
1582          rreg_lrs_db_next++;
1583       }
1584 
1585       if (DEBUG_REGALLOC) {
1586          vex_printf("After post-insn actions for fixed regs:\n");
1587          PRINT_STATE;
1588          vex_printf("\n");
1589       }
1590 
1591    } /* iterate over insns */
1592 
1593    /* ------ END: Process each insn in turn. ------ */
1594 
1595    /* free(rreg_state); */
1596    /* free(rreg_lrs); */
1597    /* if (vreg_lrs) free(vreg_lrs); */
1598 
1599    /* Paranoia */
1600    vassert(rreg_lrs_la_next == rreg_lrs_used);
1601    vassert(rreg_lrs_db_next == rreg_lrs_used);
1602 
1603    return instrs_out;
1604 
1605 #  undef INVALID_INSTRNO
1606 #  undef EMIT_INSTR
1607 #  undef PRINT_STATE
1608 }
1609 
1610 
1611 
1612 /*---------------------------------------------------------------*/
1613 /*--- end                                   host_reg_alloc2.c ---*/
1614 /*---------------------------------------------------------------*/
1615