1""" NNAPI Systrace parser - tracking of call tree based on trace lines
2
3    See contract-between-code-and-parser.txt for the
4    specification (cases in the specification are referred to with SPEC).
5"""
6
7import re
8import sys
9from parser.naming import (subphases, translate_hidl_mark_to_nn_and_tag,
10                           get_function_name_from_mark, make_tag)
11from parser.naming import LAYER_CPU, LAYER_DRIVER, LAYER_RUNTIME, LAYER_APPLICATION
12from parser.naming import MARKER_SWITCH, MARKER_SUBTRACT
13from parser.naming import PHASE_EXECUTION, PHASE_OVERALL, PHASE_WARMUP, PHASE_BENCHMARK
14from parser.tree import SingleThreadCallTree
15
class AppPhase(object):
  """ Tracks the overall phase (e.g. warmup vs. benchmark) of the program.

      Used to split statistics between warmup and benchmark runs. Kept
      separate from the call trees so the distinction can be propagated
      to the driver side as well.
  """
  def __init__(self):
    self.reset()

  def current(self):
    # Innermost active phase; PHASE_OVERALL when no phase has been entered.
    return self.stack[-1] if self.stack else PHASE_OVERALL

  def push(self, phase):
    # Enter a nested phase.
    self.stack.append(phase)

  def pop(self):
    # Leave the innermost phase.
    self.stack.pop()

  def reset(self):
    # Forget all phase nesting; current() falls back to PHASE_OVERALL.
    self.stack = []
38
class Tracker(object):
  """ Class to track the stack trace of a single thread and feed it into a SingleThreadCallTree
      as well as keeping track of entry and exit times for functions.

      Exposes statistics for a single thread, transforming the call tree as needed.
      All statistics are in milliseconds.

      Layer Runtime, Phase Execution (LR_PE) is special-cased, see comment in get_stat().

      Subphases of Execution are aggregated towards the overall Execution phase as needed.
  """
  def __init__(self, tgid, is_driver, app_phase):
    # tgid: thread group id (process id) of the traced process.
    # is_driver: True when this process is the NN driver rather than an
    #            application; controls the CPU/Driver layer normalization in
    #            handle_mark().
    # app_phase: shared AppPhase instance used to split warmup/benchmark stats.
    self.tgid = tgid
    self.is_driver = is_driver
    self.app_phase = app_phase

    # Match the trace string
    # "[NN_LA_PP]funcE1" in "B|<thread1>|[NN_LA_PP]funcE1"
    # "[NN_LC_PCO]funcC1" in "B|<thread1>|[SW][NN_LC_PCO]funcC1"
    self.matcher = re.compile(r"B\|\d+\|.*\[([^]]+)\]\[?([^]])\]?")

    self.reset()

  def reset(self):
    # Clear all accumulated state so the tracker can process a fresh trace.
    self.stats = {}                # tag -> accumulated elapsed time in ms
    self.items = {}                # tag -> list of per-mark debug strings
    self.mytree = SingleThreadCallTree()
    self.begins_and_ends_ms = {}   # "<app_phase>::<function>" -> [[begin_ms, end_ms], ...]
    self.la_pe_counts = {}         # app phase -> count of LA_PE begin marks seen
    self.debugstring = "\n"

  def handle_mark(self, time, mark):
    """ Handle a single trace item (scoped entry and exit).
        Translates:
          - Automatically generated HIDL traces into NNTRACE layers and phases
          - SPEC:Switch phase during function into dummy items
          - SPEC:Subtracting time when nesting is violated into "subtract"
            markers
          - CPU/Driver layer distinction based on whether the process is the
            driver or an application
        This function is called multiple times for a single application run,
        afterwards the statistics can be calculated.

        time: timestamp of the trace line (in seconds; stored as-is and
              converted to ms only when statistics are produced).
        mark: the scoped-trace payload, "B|<tgid>|<name>" for a begin or
              "E" for an end.
    """
    if mark[0] == "B":
      switch = False
      subtract = False
      # Workarounds for wrong tracepoints in early versions
      # TODO(mikie): remove later
      if ("ANeuralNetworksEvent_free" in mark) or ("ANeuralNetworksExecution_free" in mark):
        mark = mark.replace("_PT", "_PE")
      # Workarounds for trace marker for getSupportedExtensions (fixed in ag/9484333)
      if ("getSupportedExtensions" in mark):
        mark = mark.replace("_PC", "_PI")
      elif ("[SW][NN_LA_PR]executeWithCompilation" in mark):
        mark = mark.replace("[SW]", "")
      if MARKER_SWITCH in mark:
        switch = True
      if MARKER_SUBTRACT in mark:
        subtract = True
      if switch:
        # End previous item
        self.handle_mark(time, "E")
        # Push a dummy item that will get popped by the 'real' end of the
        # previous item.
        self.mytree.push_dummy(time)
      m = self.matcher.search(mark)
      if m is None:
        # No explicit [NN_...] tag; try to translate an automatically
        # generated HIDL trace mark into an NN tag.
        tag = translate_hidl_mark_to_nn_and_tag(mark)
        if tag is None:
          raise Exception("Couldn't parse mark " + mark)
      else:
        tag = m.group(1)
      # Tag format is "NN_<layer>_<phase>", e.g. "NN_LA_PP".
      [_, layer, phase] = tag.split("_")
      if layer == LAYER_APPLICATION and phase in [PHASE_WARMUP, PHASE_BENCHMARK]:
        self.app_phase.push(phase)
      # Driver-layer work in a non-driver process is the CPU fallback, and
      # vice versa in the driver process; normalize the layer accordingly.
      if not self.is_driver:
        layer = layer.replace(LAYER_DRIVER, LAYER_CPU)
      else:
        layer = layer.replace(LAYER_CPU, LAYER_DRIVER)
      if layer == LAYER_APPLICATION and phase == PHASE_EXECUTION:
        # Count application-level execution begins per app phase; used as a
        # fallback execution count in get_execution_count().
        self.la_pe_counts[self.app_phase.current()] = (
            self.la_pe_counts.get(self.app_phase.current(), 0) + 1)
      self.mytree.push(time, mark, layer, phase, self.app_phase.current(), subtract)
    elif mark[0] == "E":
      try:
        node = self.mytree.pop(time)
        if node.is_dummy():  # Dummy item
          pass
        else:
          if node.layer == LAYER_APPLICATION and node.phase in [PHASE_WARMUP, PHASE_BENCHMARK]:
            self.app_phase.pop()
          # Record begin/end in milliseconds keyed by "<app_phase>::<function>"
          # for later retrieval via get_begins()/get_ends().
          function = node.app_phase + "::" + get_function_name_from_mark(node.mark)
          self.begins_and_ends_ms[function] = (self.begins_and_ends_ms.get(function, []) +
                                               [[float(node.start_time_s) * 1000.0,
                                                 float(node.end_time_s) * 1000.0]])
      except IndexError as e:
        # Popping an empty tree: an "E" without a matching "B", i.e. the
        # trace was cut off mid-run.
        raise Exception("Unable to process a trace termination mark, please check that the collected trace are including full application lifecycles.\n") from e

  def is_complete(self):
    """ Checks if we've seen all end tracepoints for the begin tracepoints.
    """
    return self.mytree.current.is_root()

  def calculate_stats(self):
    """ Transform the call tree and aggregate per-tag elapsed times into
        self.stats/self.items. Requires a complete trace (see is_complete()).
    """
    assert self.is_complete()
    # Normalize the tree before aggregation; these transforms are
    # order-dependent (see parser.tree for their semantics).
    self.mytree.remove_ignored()
    self.mytree.remove_dummies()
    self.mytree.copy_subtracted_init_and_wrong_la()
    self.mytree.add_missing_la_nodes()
    # self.mytree.print()
    self.mytree.validate_nesting()

    def recurse(node, prev_layer, prev_phase, indent, in_pe_layers):
      # Walk the tree depth-first, accumulating elapsed time per
      # "<app_phase>_<layer>_<phase>" tag. in_pe_layers counts, per layer,
      # how deep we are inside PHASE_EXECUTION so that execution subphases
      # are only aggregated into the overall *_PE tag at the outermost level.
      [begun, mark, layer, phase] = [
          node.start_time_s, node.mark, node.layer, node.phase()]
      time = node.end_time_s
      tag = None
      elapsed0 = "DETAIL"
      elapsed1 = node.elapsed_less_subtracted_ms()
      if elapsed1 is None:
        raise Exception("Elapsed for {} returned None".format(node.to_str()))

      # Added-detail and subtract nodes are bookkeeping only; they don't
      # contribute to the statistics directly.
      if not node.is_added_detail() and not node.subtract:
        tag = node.app_phase + "_" + layer + "_" + phase
        elapsed0 = elapsed1
        self.stats[tag] = self.stats.get(tag, 0.0) + elapsed0
        self.items[tag] = self.items.get(tag, []) + [
            mark + " " + str(elapsed0) + " " + str(elapsed1) + " " +  tag]
        if phase in subphases[PHASE_EXECUTION]:
          if not in_pe_layers.get(layer):
            # Outermost execution subphase for this layer: also aggregate
            # it into the overall Execution phase tag.
            pe_tag = node.app_phase + "_" + make_tag(layer, PHASE_EXECUTION)
            self.stats[pe_tag] = self.stats.get(pe_tag, 0.0) + elapsed0
            self.items[pe_tag] = self.items.get(pe_tag, []) + [
                mark + " " + str(elapsed0) + " " + str(elapsed1) + " " +  pe_tag]
      if phase == PHASE_EXECUTION:
        in_pe_layers[layer] = in_pe_layers.get(layer, 0) + 1
      for c in node.children:
        recurse(c, layer or prev_layer, phase or prev_phase,
                indent + "  ", in_pe_layers)
      if phase == PHASE_EXECUTION:
        in_pe_layers[layer] = in_pe_layers[layer] - 1
      return

    for top in self.mytree.root.children:
      recurse(top, None, None, "", {})
    self.debugstring = self.mytree.to_str()

  # We need to special case the driver execution time because:
  # - The existing drivers don't have tracing, so we rely on HIDL traces
  # - Best we can do is to take the start of the HIDL server side call as
  #   the starting point (which includes a bit of overhead, but not much) and
  #   the start of the callback as the end point (which should be pretty
  #   accurate)
  # Note that the begin and end may be on different threads, hence the
  # calculation needs to happen in aggregation rather than here.
  def get_ld_pe_begins(self, app_phase):
      return self.get_begins(app_phase, "HIDL::IPreparedModel::execute::server")

  def get_ld_pe_ends(self, app_phase):
      # Intentionally get_begins: the *start* of the callback is the end
      # point of driver execution (see comment above).
      return self.get_begins(app_phase, "HIDL::IExecutionCallback::notify::client")

  def get_stat(self, tag, app_phase, special_case_pe=True):
    """ Return the accumulated time in ms for the given tag in the given
        app phase; 0.0 if nothing was recorded. LR_PE is special-cased
        (reconstructed from API call begin/end times) unless
        special_case_pe is False.
    """
    if not self.stats and not self.mytree.is_empty():
      # Lazily compute statistics on first access.
      self.calculate_stats()
    if tag == make_tag(LAYER_RUNTIME, PHASE_EXECUTION) and special_case_pe:
      # Execution is exposed as an asynchronous event from the runtime, we
      # calculate the runtime time as starting from when the async operation is
      # kicked off until wait finishes + synchronous setup and teardown calls.
      # This has two limitations:
      #   - multithreaded usage will not work correctly
      #   - should the application spend so much time before calling wait that
      #     execution has already finished, the time would get allocated to the
      #     runtime incorrectly
      async_starts = self.get_begins(app_phase, "ANeuralNetworksExecution_startCompute")
      async_ends = self.get_ends(app_phase, "ANeuralNetworksEvent_wait")
      elapsed = 0.0
      for i in range(0, len(async_starts)):
        elapsed = elapsed + (async_ends[i] - async_starts[i])
      for sync in ["ANeuralNetworksExecution_create", "ANeuralNetworksExecution_free",
                   "ANeuralNetworksEvent_create", "ANeuralNetworksEvent_free",
                   "ANeuralNetworksExecution_setInput", "ANeuralNetworksExecution_setOutput",
                   "ANeuralNetworksExecution_setInputFromMemory",
                   "ANeuralNetworksExecution_setOutputFromMemory"]:
        sync_starts = self.get_begins(app_phase, sync)
        sync_ends = self.get_ends(app_phase, sync)
        for i in range(0, len(sync_starts)):
          elapsed = elapsed + (sync_ends[i] - sync_starts[i])
      return elapsed
    return self.stats.get(app_phase + "_" + tag, 0.0)

  def get_execution_count(self, app_phase):
    """ Number of executions seen in the given app phase. """
    # ANeuralNetworksExecution_create is reliable and comes from the runtime,
    # but not available pre-P
    count = len(self.get_begins(app_phase, "ANeuralNetworksExecution_create"))
    if count > 0:
      return count
    # Application may have added tracepoints
    return self.la_pe_counts.get(app_phase, 0)

  def get_begins(self, app_phase, function):
    # All recorded begin times (ms) for the function in the given app phase.
    name = app_phase + "::" + function
    return [begin_and_end[0] for begin_and_end in self.begins_and_ends_ms.get(name, [])]
  def get_ends(self, app_phase, function):
    # All recorded end times (ms) for the function in the given app phase.
    name = app_phase + "::" + function
    return [begin_and_end[1] for begin_and_end in self.begins_and_ends_ms.get(name, [])]

  def print_stats(self):
    # Dump all accumulated per-tag stats (and the marks contributing to
    # each) to stdout, computing them first if needed.
    if not self.stats:
      self.calculate_stats()
    print(self.tgid, "Driver" if self.is_driver else "App")
    for tag in self.stats:
      print(tag, self.stats[tag])
      if self.items.get(tag):
        for item in self.items[tag]:
          print("  ", item)
      else:
        print("  ", "calculated only")

  def print(self):
    # Dump the underlying call tree (for debugging).
    self.mytree.print()
259