1 /*--------------------------------------------------------------------*/
2 /*--- Callgrind                                                    ---*/
3 /*---                                                       bbcc.c ---*/
4 /*--------------------------------------------------------------------*/
5 
6 /*
7    This file is part of Callgrind, a Valgrind tool for call tracing.
8 
9    Copyright (C) 2002-2015, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
10 
11    This program is free software; you can redistribute it and/or
12    modify it under the terms of the GNU General Public License as
13    published by the Free Software Foundation; either version 2 of the
14    License, or (at your option) any later version.
15 
16    This program is distributed in the hope that it will be useful, but
17    WITHOUT ANY WARRANTY; without even the implied warranty of
18    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19    General Public License for more details.
20 
21    You should have received a copy of the GNU General Public License
22    along with this program; if not, write to the Free Software
23    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
24    02111-1307, USA.
25 
26    The GNU General Public License is contained in the file COPYING.
27 */
28 
29 #include "global.h"
30 #include "costs.h"
31 
32 #include "pub_tool_threadstate.h"
33 
34 /*------------------------------------------------------------*/
35 /*--- BBCC operations                                      ---*/
36 /*------------------------------------------------------------*/
37 
38 #define N_BBCC_INITIAL_ENTRIES  10437
39 
40 /* BBCC table (key is BB/Context), per thread, resizable */
41 bbcc_hash current_bbccs;
42 
CLG_(init_bbcc_hash)43 void CLG_(init_bbcc_hash)(bbcc_hash* bbccs)
44 {
45    Int i;
46 
47    CLG_ASSERT(bbccs != 0);
48 
49    bbccs->size    = N_BBCC_INITIAL_ENTRIES;
50    bbccs->entries = 0;
51    bbccs->table = (BBCC**) CLG_MALLOC("cl.bbcc.ibh.1",
52                                       bbccs->size * sizeof(BBCC*));
53 
54    for (i = 0; i < bbccs->size; i++) bbccs->table[i] = NULL;
55 }
56 
CLG_(copy_current_bbcc_hash)57 void CLG_(copy_current_bbcc_hash)(bbcc_hash* dst)
58 {
59   CLG_ASSERT(dst != 0);
60 
61   dst->size    = current_bbccs.size;
62   dst->entries = current_bbccs.entries;
63   dst->table   = current_bbccs.table;
64 }
65 
CLG_(get_current_bbcc_hash)66 bbcc_hash* CLG_(get_current_bbcc_hash)()
67 {
68   return &current_bbccs;
69 }
70 
CLG_(set_current_bbcc_hash)71 void CLG_(set_current_bbcc_hash)(bbcc_hash* h)
72 {
73   CLG_ASSERT(h != 0);
74 
75   current_bbccs.size    = h->size;
76   current_bbccs.entries = h->entries;
77   current_bbccs.table   = h->table;
78 }
79 
80 /*
81  * Zero all costs of a BBCC
82  */
CLG_(zero_bbcc)83 void CLG_(zero_bbcc)(BBCC* bbcc)
84 {
85   Int i;
86   jCC* jcc;
87 
88   CLG_ASSERT(bbcc->cxt != 0);
89   CLG_DEBUG(1, "  zero_bbcc: BB %#lx, Cxt %u "
90 	   "(fn '%s', rec %u)\n",
91 	   bb_addr(bbcc->bb),
92 	   bbcc->cxt->base_number + bbcc->rec_index,
93 	   bbcc->cxt->fn[0]->name,
94 	   bbcc->rec_index);
95 
96   if ((bbcc->ecounter_sum ==0) &&
97       (bbcc->ret_counter ==0)) return;
98 
99   for(i=0;i<bbcc->bb->cost_count;i++)
100     bbcc->cost[i] = 0;
101   for(i=0;i <= bbcc->bb->cjmp_count;i++) {
102     bbcc->jmp[i].ecounter = 0;
103     for(jcc=bbcc->jmp[i].jcc_list; jcc; jcc=jcc->next_from)
104 	CLG_(init_cost)( CLG_(sets).full, jcc->cost );
105   }
106   bbcc->ecounter_sum = 0;
107   bbcc->ret_counter = 0;
108 }
109 
110 
111 
CLG_(forall_bbccs)112 void CLG_(forall_bbccs)(void (*func)(BBCC*))
113 {
114   BBCC *bbcc, *bbcc2;
115   int i, j;
116 
117   for (i = 0; i < current_bbccs.size; i++) {
118     if ((bbcc=current_bbccs.table[i]) == NULL) continue;
119     while (bbcc) {
120       /* every bbcc should have a rec_array */
121       CLG_ASSERT(bbcc->rec_array != 0);
122 
123       for(j=0;j<bbcc->cxt->fn[0]->separate_recursions;j++) {
124 	if ((bbcc2 = bbcc->rec_array[j]) == 0) continue;
125 
126 	(*func)(bbcc2);
127       }
128       bbcc = bbcc->next;
129     }
130   }
131 }
132 
133 
134 /* All BBCCs for recursion level 0 are inserted into a
135  * thread specific hash table with key
136  * - address of BB structure (unique, as never freed)
137  * - current context (includes caller chain)
138  * BBCCs for other recursion levels are in bbcc->rec_array.
139  *
140  * The hash is used in setup_bb(), i.e. to find the cost
141  * counters to be changed in the execution of a BB.
142  */
143 
144 static __inline__
bbcc_hash_idx(BB * bb,Context * cxt,UInt size)145 UInt bbcc_hash_idx(BB* bb, Context* cxt, UInt size)
146 {
147    CLG_ASSERT(bb != 0);
148    CLG_ASSERT(cxt != 0);
149 
150    return ((Addr)bb + (Addr)cxt) % size;
151 }
152 
153 
154 /* Lookup for a BBCC in hash.
155  */
156 static
lookup_bbcc(BB * bb,Context * cxt)157 BBCC* lookup_bbcc(BB* bb, Context* cxt)
158 {
159    BBCC* bbcc = bb->last_bbcc;
160    UInt  idx;
161 
162    /* check LRU */
163    if (bbcc->cxt == cxt) {
164        if (!CLG_(clo).separate_threads) {
165 	   /* if we don't dump threads separate, tid doesn't have to match */
166 	   return bbcc;
167        }
168        if (bbcc->tid == CLG_(current_tid)) return bbcc;
169    }
170 
171    CLG_(stat).bbcc_lru_misses++;
172 
173    idx = bbcc_hash_idx(bb, cxt, current_bbccs.size);
174    bbcc = current_bbccs.table[idx];
175    while (bbcc &&
176 	  (bb      != bbcc->bb ||
177 	   cxt     != bbcc->cxt)) {
178        bbcc = bbcc->next;
179    }
180 
181    CLG_DEBUG(2,"  lookup_bbcc(BB %#lx, Cxt %u, fn '%s'): %p (tid %u)\n",
182 	    bb_addr(bb), cxt->base_number, cxt->fn[0]->name,
183 	    bbcc, bbcc ? bbcc->tid : 0);
184 
185    CLG_DEBUGIF(2)
186      if (bbcc) CLG_(print_bbcc)(-2,bbcc);
187 
188    return bbcc;
189 }
190 
191 
192 /* double size of hash table 1 (addr->BBCC) */
resize_bbcc_hash(void)193 static void resize_bbcc_hash(void)
194 {
195     Int i, new_size, conflicts1 = 0, conflicts2 = 0;
196     BBCC** new_table;
197     UInt new_idx;
198     BBCC *curr_BBCC, *next_BBCC;
199 
200     new_size = 2*current_bbccs.size+3;
201     new_table = (BBCC**) CLG_MALLOC("cl.bbcc.rbh.1",
202                                     new_size * sizeof(BBCC*));
203 
204     for (i = 0; i < new_size; i++)
205       new_table[i] = NULL;
206 
207     for (i = 0; i < current_bbccs.size; i++) {
208 	if (current_bbccs.table[i] == NULL) continue;
209 
210 	curr_BBCC = current_bbccs.table[i];
211 	while (NULL != curr_BBCC) {
212 	    next_BBCC = curr_BBCC->next;
213 
214 	    new_idx = bbcc_hash_idx(curr_BBCC->bb,
215 				    curr_BBCC->cxt,
216 				    new_size);
217 
218 	    curr_BBCC->next = new_table[new_idx];
219 	    new_table[new_idx] = curr_BBCC;
220 	    if (curr_BBCC->next) {
221 		conflicts1++;
222 		if (curr_BBCC->next->next)
223 		    conflicts2++;
224 	    }
225 
226 	    curr_BBCC = next_BBCC;
227 	}
228     }
229 
230     VG_(free)(current_bbccs.table);
231 
232 
233     CLG_DEBUG(0,"Resize BBCC Hash: %u => %d (entries %u, conflicts %d/%d)\n",
234 	     current_bbccs.size, new_size,
235 	     current_bbccs.entries, conflicts1, conflicts2);
236 
237     current_bbccs.size = new_size;
238     current_bbccs.table = new_table;
239     CLG_(stat).bbcc_hash_resizes++;
240 }
241 
242 
243 static __inline
new_recursion(int size)244 BBCC** new_recursion(int size)
245 {
246     BBCC** bbccs;
247     int i;
248 
249     bbccs = (BBCC**) CLG_MALLOC("cl.bbcc.nr.1", sizeof(BBCC*) * size);
250     for(i=0;i<size;i++)
251 	bbccs[i] = 0;
252 
253     CLG_DEBUG(3,"  new_recursion(size %d): %p\n", size, bbccs);
254 
255     return bbccs;
256 }
257 
258 
259 /*
260  * Allocate a new BBCC
261  *
262  * Uninitialized:
263  * cxt, rec_index, rec_array, next_bbcc, next1, next2
264  */
265 static __inline__
new_bbcc(BB * bb)266 BBCC* new_bbcc(BB* bb)
267 {
268    BBCC* bbcc;
269    Int i;
270 
271    /* We need cjmp_count+1 JmpData structs:
272     * the last is for the unconditional jump/call/ret at end of BB
273     */
274    bbcc = (BBCC*)CLG_MALLOC("cl.bbcc.nb.1",
275 			    sizeof(BBCC) +
276 			    (bb->cjmp_count+1) * sizeof(JmpData));
277    bbcc->bb  = bb;
278    bbcc->tid = CLG_(current_tid);
279 
280    bbcc->ret_counter = 0;
281    bbcc->skipped = 0;
282    bbcc->cost = CLG_(get_costarray)(bb->cost_count);
283    for(i=0;i<bb->cost_count;i++)
284      bbcc->cost[i] = 0;
285    for(i=0; i<=bb->cjmp_count; i++) {
286        bbcc->jmp[i].ecounter = 0;
287        bbcc->jmp[i].jcc_list = 0;
288    }
289    bbcc->ecounter_sum = 0;
290 
291    /* Init pointer caches (LRU) */
292    bbcc->lru_next_bbcc = 0;
293    bbcc->lru_from_jcc  = 0;
294    bbcc->lru_to_jcc  = 0;
295 
296    CLG_(stat).distinct_bbccs++;
297 
298    CLG_DEBUG(3, "  new_bbcc(BB %#lx): %p (now %d)\n",
299 	    bb_addr(bb), bbcc, CLG_(stat).distinct_bbccs);
300 
301    return bbcc;
302 }
303 
304 
305 /**
306  * Inserts a new BBCC into hashes.
307  * BBCC specific items must be set as this is used for the hash
308  * keys:
309  *  fn     : current function
310  *  tid    : current thread ID
311  *  from   : position where current function is called from
312  *
313  * Recursion level doesn't need to be set as this is not included
314  * in the hash key: Only BBCCs with rec level 0 are in hashes.
315  */
316 static
insert_bbcc_into_hash(BBCC * bbcc)317 void insert_bbcc_into_hash(BBCC* bbcc)
318 {
319     UInt idx;
320 
321     CLG_ASSERT(bbcc->cxt != 0);
322 
323     CLG_DEBUG(3,"+ insert_bbcc_into_hash(BB %#lx, fn '%s')\n",
324 	     bb_addr(bbcc->bb), bbcc->cxt->fn[0]->name);
325 
326     /* check fill degree of hash and resize if needed (>90%) */
327     current_bbccs.entries++;
328     if (100 * current_bbccs.entries / current_bbccs.size > 90)
329 	resize_bbcc_hash();
330 
331     idx = bbcc_hash_idx(bbcc->bb, bbcc->cxt, current_bbccs.size);
332     bbcc->next = current_bbccs.table[idx];
333     current_bbccs.table[idx] = bbcc;
334 
335     CLG_DEBUG(3,"- insert_bbcc_into_hash: %u entries\n",
336 	     current_bbccs.entries);
337 }
338 
339 /* String is returned in a dynamically allocated buffer. Caller is
340    responsible for free'ing it. */
mangled_cxt(const Context * cxt,Int rec_index)341 static HChar* mangled_cxt(const Context* cxt, Int rec_index)
342 {
343     Int i, p;
344 
345     if (!cxt) return VG_(strdup)("cl.bbcc.mcxt", "(no context)");
346 
347     /* Overestimate the number of bytes we need to hold the string. */
348     SizeT need = 20;   // rec_index + nul-terminator
349     for (i = 0; i < cxt->size; ++i)
350        need += VG_(strlen)(cxt->fn[i]->name) + 1;   // 1 for leading '
351 
352     HChar *mangled = CLG_MALLOC("cl.bbcc.mcxt", need);
353     p = VG_(sprintf)(mangled, "%s", cxt->fn[0]->name);
354     if (rec_index >0)
355 	p += VG_(sprintf)(mangled+p, "'%d", rec_index +1);
356     for(i=1;i<cxt->size;i++)
357 	p += VG_(sprintf)(mangled+p, "'%s", cxt->fn[i]->name);
358 
359     return mangled;
360 }
361 
362 
363 /* Create a new BBCC as a copy of an existing one,
364  * but with costs set to 0 and jcc chains empty.
365  *
366  * This is needed when a BB is executed in another context than
367  * the one at instrumentation time of the BB.
368  *
369  * Use cases:
370  *  rec_index == 0: clone from a BBCC with differing tid/cxt
371  *                  and insert into hashes
372  *  rec_index >0  : clone from a BBCC with same tid/cxt and rec_index 0
373  *                  don't insert into hashes
374  */
clone_bbcc(BBCC * orig,Context * cxt,Int rec_index)375 static BBCC* clone_bbcc(BBCC* orig, Context* cxt, Int rec_index)
376 {
377     BBCC* bbcc;
378 
379     CLG_DEBUG(3,"+ clone_bbcc(BB %#lx, rec %d, fn %s)\n",
380 	     bb_addr(orig->bb), rec_index, cxt->fn[0]->name);
381 
382     bbcc = new_bbcc(orig->bb);
383 
384     if (rec_index == 0) {
385 
386       /* hash insertion is only allowed if tid or cxt is different */
387       CLG_ASSERT((orig->tid != CLG_(current_tid)) ||
388 		(orig->cxt != cxt));
389 
390       bbcc->rec_index = 0;
391       bbcc->cxt = cxt;
392       bbcc->rec_array = new_recursion(cxt->fn[0]->separate_recursions);
393       bbcc->rec_array[0] = bbcc;
394 
395       insert_bbcc_into_hash(bbcc);
396     }
397     else {
398       if (CLG_(clo).separate_threads)
399 	CLG_ASSERT(orig->tid == CLG_(current_tid));
400 
401       CLG_ASSERT(orig->cxt == cxt);
402       CLG_ASSERT(orig->rec_array);
403       CLG_ASSERT(cxt->fn[0]->separate_recursions > rec_index);
404       CLG_ASSERT(orig->rec_array[rec_index] ==0);
405 
406       /* new BBCC will only have differing recursion level */
407       bbcc->rec_index = rec_index;
408       bbcc->cxt = cxt;
409       bbcc->rec_array = orig->rec_array;
410       bbcc->rec_array[rec_index] = bbcc;
411     }
412 
413     /* update list of BBCCs for same BB */
414     bbcc->next_bbcc = orig->bb->bbcc_list;
415     orig->bb->bbcc_list = bbcc;
416 
417 
418     CLG_DEBUGIF(3)
419       CLG_(print_bbcc)(-2, bbcc);
420 
421     HChar *mangled_orig = mangled_cxt(orig->cxt, orig->rec_index);
422     HChar *mangled_bbcc = mangled_cxt(bbcc->cxt, bbcc->rec_index);
423     CLG_DEBUG(2,"- clone_BBCC(%p, %d) for BB %#lx\n"
424 		"   orig %s\n"
425 		"   new  %s\n",
426 	     orig, rec_index, bb_addr(orig->bb),
427              mangled_orig,
428              mangled_bbcc);
429     CLG_FREE(mangled_orig);
430     CLG_FREE(mangled_bbcc);
431 
432     CLG_(stat).bbcc_clones++;
433 
434     return bbcc;
435 };
436 
437 
438 
439 /* Get a pointer to the cost centre structure for given basic block
440  * address. If created, the BBCC is inserted into the BBCC hash.
441  * Also sets BB_seen_before by reference.
442  *
443  */
CLG_(get_bbcc)444 BBCC* CLG_(get_bbcc)(BB* bb)
445 {
446    BBCC* bbcc;
447 
448    CLG_DEBUG(3, "+ get_bbcc(BB %#lx)\n", bb_addr(bb));
449 
450    bbcc = bb->bbcc_list;
451 
452    if (!bbcc) {
453      bbcc = new_bbcc(bb);
454 
455      /* initialize BBCC */
456      bbcc->cxt       = 0;
457      bbcc->rec_array = 0;
458      bbcc->rec_index = 0;
459 
460      bbcc->next_bbcc = bb->bbcc_list;
461      bb->bbcc_list = bbcc;
462      bb->last_bbcc = bbcc;
463 
464      CLG_DEBUGIF(3)
465        CLG_(print_bbcc)(-2, bbcc);
466    }
467 
468    CLG_DEBUG(3, "- get_bbcc(BB %#lx): BBCC %p\n",
469 		bb_addr(bb), bbcc);
470 
471    return bbcc;
472 }
473 
474 
475 /* Callgrind manages its own call stack for each thread.
476  * When leaving a function, a underflow can happen when
477  * Callgrind's tracing was switched on in the middle of
478  * a run, i.e. when Callgrind was not able to trace the
479  * call instruction.
480  * This function tries to reconstruct the original call.
481  * As we know the return address (the address following
482  * the CALL instruction), we can detect the function
483  * we return back to, but the original call site is unknown.
484  * We suppose a call site at return address - 1.
485  * (TODO: other heuristic: lookup info of instrumented BBs).
486  */
handleUnderflow(BB * bb)487 static void handleUnderflow(BB* bb)
488 {
489   /* RET at top of call stack */
490   BBCC* source_bbcc;
491   BB* source_bb;
492   Bool seen_before;
493   fn_node* caller;
494   int fn_number;
495   unsigned *pactive;
496   call_entry* call_entry_up;
497 
498   CLG_DEBUG(1,"  Callstack underflow !\n");
499 
500   /* we emulate an old call from the function we return to
501    * by using (<return address> -1) */
502   source_bb = CLG_(get_bb)(bb_addr(bb)-1, 0, &seen_before);
503   source_bbcc = CLG_(get_bbcc)(source_bb);
504 
505   /* seen_before can be true if RET from a signal handler */
506   if (!seen_before) {
507     source_bbcc->ecounter_sum = CLG_(current_state).collect ? 1 : 0;
508   }
509   else if (CLG_(current_state).collect)
510     source_bbcc->ecounter_sum++;
511 
512   /* Force a new top context, will be set active by push_cxt() */
513   CLG_(current_fn_stack).top--;
514   CLG_(current_state).cxt = 0;
515   caller = CLG_(get_fn_node)(bb);
516   CLG_(push_cxt)( caller );
517 
518   if (!seen_before) {
519     /* set rec array for source BBCC: this is at rec level 1 */
520     source_bbcc->rec_array = new_recursion(caller->separate_recursions);
521     source_bbcc->rec_array[0] = source_bbcc;
522 
523     CLG_ASSERT(source_bbcc->cxt == 0);
524     source_bbcc->cxt = CLG_(current_state).cxt;
525     insert_bbcc_into_hash(source_bbcc);
526   }
527   CLG_ASSERT(CLG_(current_state).bbcc);
528 
529   /* correct active counts */
530   fn_number = CLG_(current_state).bbcc->cxt->fn[0]->number;
531   pactive = CLG_(get_fn_entry)(fn_number);
532   (*pactive)--;
533 
534   /* This assertion is not correct for reentrant
535    * signal handlers */
536   /* CLG_ASSERT(*pactive == 0); */
537 
538   CLG_(current_state).nonskipped = 0; /* we didn't skip this function */
539   /* back to current context */
540   CLG_(push_cxt)( CLG_(current_state).bbcc->cxt->fn[0] );
541   CLG_(push_call_stack)(source_bbcc, 0, CLG_(current_state).bbcc,
542 		       (Addr)-1, False);
543   call_entry_up =
544     &(CLG_(current_call_stack).entry[CLG_(current_call_stack).sp -1]);
545   /* assume this call is lasting since last dump or
546    * for a signal handler since it's call */
547   if (CLG_(current_state).sig == 0)
548     CLG_(copy_cost)( CLG_(sets).full, call_entry_up->enter_cost,
549 		    CLG_(get_current_thread)()->lastdump_cost );
550   else
551     CLG_(zero_cost)( CLG_(sets).full, call_entry_up->enter_cost );
552 }
553 
554 
555 /*
556  * Helper function called at start of each instrumented BB to setup
557  * pointer to costs for current thread/context/recursion level
558  */
559 
560 VG_REGPARM(1)
CLG_(setup_bbcc)561 void CLG_(setup_bbcc)(BB* bb)
562 {
563   BBCC *bbcc, *last_bbcc;
564   Bool  call_emulation = False, delayed_push = False, skip = False;
565   Addr sp;
566   BB* last_bb;
567   ThreadId tid;
568   ClgJumpKind jmpkind;
569   Bool isConditionalJump;
570   Int passed = 0, csp;
571   Bool ret_without_call = False;
572   Int popcount_on_return = 1;
573 
574   CLG_DEBUG(3,"+ setup_bbcc(BB %#lx)\n", bb_addr(bb));
575 
576   /* This is needed because thread switches can not reliable be tracked
577    * with callback CLG_(run_thread) only: we have otherwise no way to get
578    * the thread ID after a signal handler returns.
579    * This could be removed again if that bug is fixed in Valgrind.
580    * This is in the hot path but hopefully not to costly.
581    */
582   tid = VG_(get_running_tid)();
583 #if 1
584   /* CLG_(switch_thread) is a no-op when tid is equal to CLG_(current_tid).
585    * As this is on the hot path, we only call CLG_(switch_thread)(tid)
586    * if tid differs from the CLG_(current_tid).
587    */
588   if (UNLIKELY(tid != CLG_(current_tid)))
589      CLG_(switch_thread)(tid);
590 #else
591   CLG_ASSERT(VG_(get_running_tid)() == CLG_(current_tid));
592 #endif
593 
594   sp = VG_(get_SP)(tid);
595   last_bbcc = CLG_(current_state).bbcc;
596   last_bb = last_bbcc ? last_bbcc->bb : 0;
597 
598   if (last_bb) {
599       passed = CLG_(current_state).jmps_passed;
600       CLG_ASSERT(passed <= last_bb->cjmp_count);
601       jmpkind = last_bb->jmp[passed].jmpkind;
602       isConditionalJump = (passed < last_bb->cjmp_count);
603 
604       if (CLG_(current_state).collect) {
605 	if (!CLG_(current_state).nonskipped) {
606 	  last_bbcc->ecounter_sum++;
607 	  last_bbcc->jmp[passed].ecounter++;
608 	  if (!CLG_(clo).simulate_cache) {
609 	      /* update Ir cost */
610               UInt instr_count = last_bb->jmp[passed].instr+1;
611               CLG_(current_state).cost[ fullOffset(EG_IR) ] += instr_count;
612 	  }
613 	}
614 	else {
615 	  /* do not increment exe counter of BBs in skipped functions, as it
616 	   * would fool dumping code */
617 	  if (!CLG_(clo).simulate_cache) {
618 	      /* update Ir cost */
619               UInt instr_count = last_bb->jmp[passed].instr+1;
620               CLG_(current_state).cost[ fullOffset(EG_IR) ] += instr_count;
621               CLG_(current_state).nonskipped->skipped[ fullOffset(EG_IR) ]
622 		+= instr_count;
623 	  }
624 	}
625       }
626 
627       CLG_DEBUGIF(4) {
628 	  CLG_(print_execstate)(-2, &CLG_(current_state) );
629 	  CLG_(print_bbcc_cost)(-2, last_bbcc);
630       }
631   }
632   else {
633       jmpkind = jk_None;
634       isConditionalJump = False;
635   }
636 
637   /* Manipulate JmpKind if needed, only using BB specific info */
638 
639   csp = CLG_(current_call_stack).sp;
640 
641   /* A return not matching the top call in our callstack is a jump */
642   if ( (jmpkind == jk_Return) && (csp >0)) {
643       Int csp_up = csp-1;
644       call_entry* top_ce = &(CLG_(current_call_stack).entry[csp_up]);
645 
646       /* We have a real return if
647        * - the stack pointer (SP) left the current stack frame, or
648        * - SP has the same value as when reaching the current function
649        *   and the address of this BB is the return address of last call
650        *   (we even allow to leave multiple frames if the SP stays the
651        *    same and we find a matching return address)
652        * The latter condition is needed because on PPC, SP can stay
653        * the same over CALL=b(c)l / RET=b(c)lr boundaries
654        */
655       if (sp < top_ce->sp) popcount_on_return = 0;
656       else if (top_ce->sp == sp) {
657 	  while(1) {
658 	      if (top_ce->ret_addr == bb_addr(bb)) break;
659 	      if (csp_up>0) {
660 		  csp_up--;
661 		  top_ce = &(CLG_(current_call_stack).entry[csp_up]);
662 		  if (top_ce->sp == sp) {
663 		      popcount_on_return++;
664 		      continue;
665 		  }
666 	      }
667 	      popcount_on_return = 0;
668 	      break;
669 	  }
670       }
671       if (popcount_on_return == 0) {
672 	  jmpkind = jk_Jump;
673 	  ret_without_call = True;
674       }
675   }
676 
677   /* Should this jump be converted to call or pop/call ? */
678   if (( jmpkind != jk_Return) &&
679       ( jmpkind != jk_Call) && last_bb) {
680 
681     /* We simulate a JMP/Cont to be a CALL if
682      * - jump is in another ELF object or section kind
683      * - jump is to first instruction of a function (tail recursion)
684      */
685     if (ret_without_call ||
686 	/* This is for detection of optimized tail recursion.
687 	 * On PPC, this is only detected as call when going to another
688 	 * function. The problem is that on PPC it can go wrong
689 	 * more easily (no stack frame setup needed)
690 	 */
691 #if defined(VGA_ppc32)
692 	(bb->is_entry && (last_bb->fn != bb->fn)) ||
693 #else
694 	bb->is_entry ||
695 #endif
696 	(last_bb->sect_kind != bb->sect_kind) ||
697 	(last_bb->obj->number != bb->obj->number)) {
698 
699 	CLG_DEBUG(1,"     JMP: %s[%s] to %s[%s]%s!\n",
700 		  last_bb->fn->name, last_bb->obj->name,
701 		  bb->fn->name, bb->obj->name,
702 		  ret_without_call?" (RET w/o CALL)":"");
703 
704 	if (CLG_(get_fn_node)(last_bb)->pop_on_jump && (csp>0)) {
705 
706 	    call_entry* top_ce = &(CLG_(current_call_stack).entry[csp-1]);
707 
708 	    if (top_ce->jcc) {
709 
710 		CLG_DEBUG(1,"     Pop on Jump!\n");
711 
712 		/* change source for delayed push */
713 		CLG_(current_state).bbcc = top_ce->jcc->from;
714 		sp = top_ce->sp;
715 		passed = top_ce->jcc->jmp;
716 		CLG_(pop_call_stack)();
717 	    }
718 	    else {
719 		CLG_ASSERT(CLG_(current_state).nonskipped != 0);
720 	    }
721 	}
722 
723 	jmpkind = jk_Call;
724 	call_emulation = True;
725     }
726   }
727 
728   if (jmpkind == jk_Call)
729     skip = CLG_(get_fn_node)(bb)->skip;
730 
731   CLG_DEBUGIF(1) {
732     if (isConditionalJump)
733       VG_(printf)("Cond-");
734     switch(jmpkind) {
735     case jk_None:   VG_(printf)("Fall-through"); break;
736     case jk_Jump:   VG_(printf)("Jump"); break;
737     case jk_Call:   VG_(printf)("Call"); break;
738     case jk_Return: VG_(printf)("Return"); break;
739     default:        tl_assert(0);
740     }
741     VG_(printf)(" %08lx -> %08lx, SP %08lx\n",
742 		last_bb ? bb_jmpaddr(last_bb) : 0,
743 		bb_addr(bb), sp);
744   }
745 
746   /* Handle CALL/RET and update context to get correct BBCC */
747 
748   if (jmpkind == jk_Return) {
749 
750     if ((csp == 0) ||
751 	((CLG_(current_fn_stack).top > CLG_(current_fn_stack).bottom) &&
752 	 ( *(CLG_(current_fn_stack).top-1)==0)) ) {
753 
754       /* On an empty call stack or at a signal separation marker,
755        * a RETURN generates an call stack underflow.
756        */
757       handleUnderflow(bb);
758       CLG_(pop_call_stack)();
759     }
760     else {
761 	CLG_ASSERT(popcount_on_return >0);
762 	CLG_(unwind_call_stack)(sp, popcount_on_return);
763     }
764   }
765   else {
766     Int unwind_count = CLG_(unwind_call_stack)(sp, 0);
767     if (unwind_count > 0) {
768       /* if unwinding was done, this actually is a return */
769       jmpkind = jk_Return;
770     }
771 
772     if (jmpkind == jk_Call) {
773       delayed_push = True;
774 
775       csp = CLG_(current_call_stack).sp;
776       if (call_emulation && csp>0)
777 	sp = CLG_(current_call_stack).entry[csp-1].sp;
778 
779     }
780   }
781 
782   /* Change new context if needed, taking delayed_push into account */
783   if ((delayed_push && !skip) || (CLG_(current_state).cxt == 0)) {
784     CLG_(push_cxt)(CLG_(get_fn_node)(bb));
785   }
786   CLG_ASSERT(CLG_(current_fn_stack).top > CLG_(current_fn_stack).bottom);
787 
788   /* If there is a fresh instrumented BBCC, assign current context */
789   bbcc = CLG_(get_bbcc)(bb);
790   if (bbcc->cxt == 0) {
791     CLG_ASSERT(bbcc->rec_array == 0);
792 
793     bbcc->cxt = CLG_(current_state).cxt;
794     bbcc->rec_array =
795       new_recursion((*CLG_(current_fn_stack).top)->separate_recursions);
796     bbcc->rec_array[0] = bbcc;
797 
798     insert_bbcc_into_hash(bbcc);
799   }
800   else {
801     /* get BBCC with current context */
802 
803     /* first check LRU of last bbcc executed */
804 
805     if (last_bbcc) {
806       bbcc = last_bbcc->lru_next_bbcc;
807       if (bbcc &&
808 	  ((bbcc->bb != bb) ||
809 	   (bbcc->cxt != CLG_(current_state).cxt)))
810 	bbcc = 0;
811     }
812     else
813       bbcc = 0;
814 
815     if (!bbcc)
816       bbcc = lookup_bbcc(bb, CLG_(current_state).cxt);
817     if (!bbcc)
818       bbcc = clone_bbcc(bb->bbcc_list, CLG_(current_state).cxt, 0);
819 
820     bb->last_bbcc = bbcc;
821   }
822 
823   /* save for fast lookup */
824   if (last_bbcc)
825     last_bbcc->lru_next_bbcc = bbcc;
826 
827   if ((*CLG_(current_fn_stack).top)->separate_recursions >1) {
828     UInt level, idx;
829     fn_node* top = *(CLG_(current_fn_stack).top);
830 
831     level = *CLG_(get_fn_entry)(top->number);
832 
833     if (delayed_push && !skip) {
834       if (CLG_(clo).skip_direct_recursion) {
835         /* a call was detected, which means that the source BB != 0 */
836 	CLG_ASSERT(CLG_(current_state).bbcc != 0);
837 	/* only increment rec. level if called from different function */
838 	if (CLG_(current_state).bbcc->cxt->fn[0] != bbcc->cxt->fn[0])
839 	  level++;
840       }
841       else level++;
842     }
843     if (level> top->separate_recursions)
844       level = top->separate_recursions;
845 
846     if (level == 0) {
847       /* can only happen if instrumentation just was switched on */
848       level = 1;
849       *CLG_(get_fn_entry)(top->number) = 1;
850     }
851 
852     idx = level -1;
853     if (bbcc->rec_array[idx])
854       bbcc = bbcc->rec_array[idx];
855     else
856       bbcc = clone_bbcc(bbcc, CLG_(current_state).cxt, idx);
857 
858     CLG_ASSERT(bbcc->rec_array[bbcc->rec_index] == bbcc);
859   }
860 
861   if (delayed_push) {
862     if (!skip && CLG_(current_state).nonskipped) {
863       /* a call from skipped to nonskipped */
864       CLG_(current_state).bbcc = CLG_(current_state).nonskipped;
865       /* FIXME: take the real passed count from shadow stack */
866       passed = CLG_(current_state).bbcc->bb->cjmp_count;
867     }
868     CLG_(push_call_stack)(CLG_(current_state).bbcc, passed,
869 			 bbcc, sp, skip);
870   }
871 
872   if (CLG_(clo).collect_jumps && (jmpkind == jk_Jump)) {
873 
874     /* Handle conditional jumps followed, i.e. trace arcs
875      * This uses JCC structures, too */
876 
877     jCC* jcc = CLG_(get_jcc)(last_bbcc, passed, bbcc);
878     CLG_ASSERT(jcc != 0);
879     // Change from default, and check if already changed
880     if (jcc->jmpkind == jk_Call)
881       jcc->jmpkind = isConditionalJump ? jk_CondJump : jk_Jump;
882     else {
883 	// FIXME: Why can this fail?
884 	// CLG_ASSERT(jcc->jmpkind == jmpkind);
885     }
886 
887     jcc->call_counter++;
888     if (isConditionalJump)
889       CLG_(stat).jcnd_counter++;
890     else
891       CLG_(stat).jump_counter++;
892   }
893 
894   CLG_(current_state).bbcc = bbcc;
895   /* Even though this will be set in instrumented code directly before
896    * side exits, it needs to be set to 0 here in case an exception
897    * happens in first instructions of the BB */
898   CLG_(current_state).jmps_passed = 0;
899   // needed for log_* handlers called in this BB
900   CLG_(bb_base)   = bb->obj->offset + bb->offset;
901   CLG_(cost_base) = bbcc->cost;
902 
903   CLG_DEBUGIF(1) {
904     VG_(printf)("     ");
905     CLG_(print_bbcc_fn)(bbcc);
906     VG_(printf)("\n");
907   }
908 
909   CLG_DEBUG(3,"- setup_bbcc (BB %#lx): Cost %p (Len %u), Instrs %u (Len %u)\n",
910 	   bb_addr(bb), bbcc->cost, bb->cost_count,
911 	   bb->instr_count, bb->instr_len);
912   CLG_DEBUGIF(3)
913     CLG_(print_cxt)(-8, CLG_(current_state).cxt, bbcc->rec_index);
914   CLG_DEBUG(3,"\n");
915 
916   CLG_(stat).bb_executions++;
917 }
918