1""" NNAPI Systrace parser - tracking of call tree based on trace lines 2 3 See contract-between-code-and-parser.txt for the 4 specification (cases in the specification are referred to with SPEC). 5""" 6 7import re 8import sys 9from parser.naming import (subphases, translate_hidl_mark_to_nn_and_tag, 10 get_function_name_from_mark, make_tag) 11from parser.naming import LAYER_CPU, LAYER_DRIVER, LAYER_RUNTIME, LAYER_APPLICATION 12from parser.naming import MARKER_SWITCH, MARKER_SUBTRACT 13from parser.naming import PHASE_EXECUTION, PHASE_OVERALL, PHASE_WARMUP, PHASE_BENCHMARK 14from parser.tree import SingleThreadCallTree 15 16class AppPhase(object): 17 """ Class to track the overall phase of the program. Used to split up warmup and benchmark. 18 19 Needs to be separate from the call trees to propagate the difference to driver. 20 """ 21 def __init__(self): 22 self.reset() 23 24 def current(self): 25 if self.stack: 26 return self.stack[-1] 27 else: 28 return PHASE_OVERALL 29 30 def push(self, phase): 31 self.stack.append(phase) 32 33 def pop(self): 34 self.stack.pop() 35 36 def reset(self): 37 self.stack = [] 38 39class Tracker(object): 40 """ Class to track the stack trace of a single thread and feed it into a SingleThreadCallTree 41 as well as keeping track of entry and exit times for functions. 42 43 Exposes statistics for a single thread, transforming the call tree as needed. 44 All statistics are in milliseconds. 45 46 Layer Runtime, Phase Execution (LR_PE) is special-cased, see comment in get_stat(). 47 48 Subphases of Execution are aggregated towards the overall Execution phase as needed. 49 """ 50 def __init__(self, tgid, is_driver, app_phase): 51 self.tgid = tgid 52 self.is_driver = is_driver 53 self.app_phase = app_phase 54 55 # Match the trace string 56 # "[NN_LA_PP]funcE1" in "B|<thread1>|[NN_LA_PP]funcE1" 57 # "[NN_LC_PCO]funcC1" in "B|<thread1>|[SW][NN_LC_PCO]funcC1" 58 self.matcher = re.compile(r"B\|\d+\|.*\[([^]]+)\]\[?([^]])\]?") 59 60 self.reset() 61 62 def reset(self): 63 self.stats = {} 64 self.items = {} 65 self.mytree = SingleThreadCallTree() 66 self.begins_and_ends_ms = {} 67 self.la_pe_counts = {} 68 self.debugstring = "\n" 69 70 def handle_mark(self, time, mark): 71 """ Handle a single trace item (scoped entry and exit). 72 Translates: 73 - Automatically generated HIDL traces into NNTRACE layers and phases 74 - SPEC:Switch phase during function into dummy items 75 - SPEC:Subtracting time when nesting is violated into "subtract" 76 markers 77 - CPU/Driver layer distinction based on whether the process is the 78 driver or an application 79 This function is called multiple times for a single application run, 80 afterwards the statistics can be calculated. 81 """ 82 if mark[0] == "B": 83 switch = False 84 subtract = False 85 # Workarounds for wrong tracepoints in early versions 86 # TODO(mikie): remove later 87 if ("ANeuralNetworksEvent_free" in mark) or ("ANeuralNetworksExecution_free" in mark): 88 mark = mark.replace("_PT", "_PE") 89 # Workarounds for trace marker for getSupportedExtensions (fixed in ag/9484333) 90 if ("getSupportedExtensions" in mark): 91 mark = mark.replace("_PC", "_PI") 92 elif ("[SW][NN_LA_PR]executeWithCompilation" in mark): 93 mark = mark.replace("[SW]", "") 94 if MARKER_SWITCH in mark: 95 switch = True 96 if MARKER_SUBTRACT in mark: 97 subtract = True 98 if switch: 99 # End previous item 100 self.handle_mark(time, "E") 101 # Push a dummy item that will get popped by the 'real' end of the 102 # previous item. 103 self.mytree.push_dummy(time) 104 m = self.matcher.search(mark) 105 if m is None: 106 tag = translate_hidl_mark_to_nn_and_tag(mark) 107 if tag is None: 108 raise Exception("Couldn't parse mark " + mark) 109 else: 110 tag = m.group(1) 111 [_, layer, phase] = tag.split("_") 112 if layer == LAYER_APPLICATION and phase in [PHASE_WARMUP, PHASE_BENCHMARK]: 113 self.app_phase.push(phase) 114 if not self.is_driver: 115 layer = layer.replace(LAYER_DRIVER, LAYER_CPU) 116 else: 117 layer = layer.replace(LAYER_CPU, LAYER_DRIVER) 118 if layer == LAYER_APPLICATION and phase == PHASE_EXECUTION: 119 self.la_pe_counts[self.app_phase.current()] = ( 120 self.la_pe_counts.get(self.app_phase.current(), 0) + 1) 121 self.mytree.push(time, mark, layer, phase, self.app_phase.current(), subtract) 122 elif mark[0] == "E": 123 try: 124 node = self.mytree.pop(time) 125 if node.is_dummy(): # Dummy item 126 pass 127 else: 128 if node.layer == LAYER_APPLICATION and node.phase in [PHASE_WARMUP, PHASE_BENCHMARK]: 129 self.app_phase.pop() 130 function = node.app_phase + "::" + get_function_name_from_mark(node.mark) 131 self.begins_and_ends_ms[function] = (self.begins_and_ends_ms.get(function, []) + 132 [[float(node.start_time_s) * 1000.0, 133 float(node.end_time_s) * 1000.0]]) 134 except IndexError as e: 135 raise Exception("Unable to process a trace termination mark, please check that the collected trace are including full application lifecycles.\n") from e 136 137 def is_complete(self): 138 """ Checks if we've seen all end tracepoints for the begin tracepoints. 139 """ 140 return self.mytree.current.is_root() 141 142 def calculate_stats(self): 143 assert self.is_complete() 144 self.mytree.remove_ignored() 145 self.mytree.remove_dummies() 146 self.mytree.copy_subtracted_init_and_wrong_la() 147 self.mytree.add_missing_la_nodes() 148 # self.mytree.print() 149 self.mytree.validate_nesting() 150 151 def recurse(node, prev_layer, prev_phase, indent, in_pe_layers): 152 [begun, mark, layer, phase] = [ 153 node.start_time_s, node.mark, node.layer, node.phase()] 154 time = node.end_time_s 155 tag = None 156 elapsed0 = "DETAIL" 157 elapsed1 = node.elapsed_less_subtracted_ms() 158 if elapsed1 is None: 159 raise Exception("Elapsed for {} returned None".format(node.to_str())) 160 161 if not node.is_added_detail() and not node.subtract: 162 tag = node.app_phase + "_" + layer + "_" + phase 163 elapsed0 = elapsed1 164 self.stats[tag] = self.stats.get(tag, 0.0) + elapsed0 165 self.items[tag] = self.items.get(tag, []) + [ 166 mark + " " + str(elapsed0) + " " + str(elapsed1) + " " + tag] 167 if phase in subphases[PHASE_EXECUTION]: 168 if not in_pe_layers.get(layer): 169 pe_tag = node.app_phase + "_" + make_tag(layer, PHASE_EXECUTION) 170 self.stats[pe_tag] = self.stats.get(pe_tag, 0.0) + elapsed0 171 self.items[pe_tag] = self.items.get(pe_tag, []) + [ 172 mark + " " + str(elapsed0) + " " + str(elapsed1) + " " + pe_tag] 173 if phase == PHASE_EXECUTION: 174 in_pe_layers[layer] = in_pe_layers.get(layer, 0) + 1 175 for c in node.children: 176 recurse(c, layer or prev_layer, phase or prev_phase, 177 indent + " ", in_pe_layers) 178 if phase == PHASE_EXECUTION: 179 in_pe_layers[layer] = in_pe_layers[layer] - 1 180 return 181 182 for top in self.mytree.root.children: 183 recurse(top, None, None, "", {}) 184 self.debugstring = self.mytree.to_str() 185 186 # We need to special case the driver execution time because: 187 # - The existing drivers don't have tracing, so we rely on HIDL traces 188 # - Best we can do is to take the start of the HIDL server side call as 189 # the starting point (which includes a bit of overhead, but not much) and 190 # the start of the callback as the end point (which should be pretty 191 # accurate) 192 # Note that the begin and end may be on different threads, hence the 193 # calculation needs to happen in aggregation rather than here. 194 def get_ld_pe_begins(self, app_phase): 195 return self.get_begins(app_phase, "HIDL::IPreparedModel::execute::server") 196 197 def get_ld_pe_ends(self, app_phase): 198 return self.get_begins(app_phase, "HIDL::IExecutionCallback::notify::client") 199 200 def get_stat(self, tag, app_phase, special_case_pe=True): 201 if not self.stats and not self.mytree.is_empty(): 202 self.calculate_stats() 203 if tag == make_tag(LAYER_RUNTIME, PHASE_EXECUTION) and special_case_pe: 204 # Execution is exposed as an asynchronous event from the runtime, we 205 # calculate the runtime time as starting from when the async operation is 206 # kicked off until wait finishes + synchronous setup and teardown calls. 207 # This has two limitations: 208 # - multithreaded usage will not work correctly 209 # - should the application spend so much time before calling wait that 210 # execution has already finished, the time would get allocated to the 211 # runtime incorrectly 212 async_starts = self.get_begins(app_phase, "ANeuralNetworksExecution_startCompute") 213 async_ends = self.get_ends(app_phase, "ANeuralNetworksEvent_wait") 214 elapsed = 0.0 215 for i in range(0, len(async_starts)): 216 elapsed = elapsed + (async_ends[i] - async_starts[i]) 217 for sync in ["ANeuralNetworksExecution_create", "ANeuralNetworksExecution_free", 218 "ANeuralNetworksEvent_create", "ANeuralNetworksEvent_free", 219 "ANeuralNetworksExecution_setInput", "ANeuralNetworksExecution_setOutput", 220 "ANeuralNetworksExecution_setInputFromMemory", 221 "ANeuralNetworksExecution_setOutputFromMemory"]: 222 sync_starts = self.get_begins(app_phase, sync) 223 sync_ends = self.get_ends(app_phase, sync) 224 for i in range(0, len(sync_starts)): 225 elapsed = elapsed + (sync_ends[i] - sync_starts[i]) 226 return elapsed 227 return self.stats.get(app_phase + "_" + tag, 0.0) 228 229 def get_execution_count(self, app_phase): 230 # ANeuralNetworksExecution_create is reliable and comes from the runtime, 231 # but not available pre-P 232 count = len(self.get_begins(app_phase, "ANeuralNetworksExecution_create")) 233 if count > 0: 234 return count 235 # Application may have added tracepoints 236 return self.la_pe_counts.get(app_phase, 0) 237 238 def get_begins(self, app_phase, function): 239 name = app_phase + "::" + function 240 return [begin_and_end[0] for begin_and_end in self.begins_and_ends_ms.get(name, [])] 241 def get_ends(self, app_phase, function): 242 name = app_phase + "::" + function 243 return [begin_and_end[1] for begin_and_end in self.begins_and_ends_ms.get(name, [])] 244 245 def print_stats(self): 246 if not self.stats: 247 self.calculate_stats() 248 print(self.tgid, "Driver" if self.is_driver else "App") 249 for tag in self.stats: 250 print(tag, self.stats[tag]) 251 if self.items.get(tag): 252 for item in self.items[tag]: 253 print(" ", item) 254 else: 255 print(" ", "calculated only") 256 257 def print(self): 258 self.mytree.print() 259