1 /* Copyright 2021 Google LLC. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef RUY_RUY_TRACE_H_
17 #define RUY_RUY_TRACE_H_
18 
19 #ifdef RUY_TRACE
20 
21 #include <algorithm>
22 #include <cstdio>
23 #include <cstdlib>
24 #include <memory>
25 #include <mutex>
26 #include <string>
27 #include <thread>
28 #include <vector>
29 
30 #include "ruy/mat.h"
31 #include "ruy/matrix.h"
32 #include "ruy/path.h"
33 #include "ruy/platform.h"
34 #include "ruy/side_pair.h"
35 
36 namespace ruy {
37 
38 // Helper for `formatted` so we don't have to put .c_str() on strings.
39 template <typename T>
value_for_snprintf(T value)40 T value_for_snprintf(T value) {
41   return value;
42 }
43 
value_for_snprintf(const std::string & s)44 inline const char* value_for_snprintf(const std::string& s) {
45   return s.c_str();
46 }
47 
48 // A sprintf-like function returning a std::string.
49 // Remove this once we can rely on std::format (c++20).
50 template <typename... Args>
formatted(const char * format,Args...args)51 std::string formatted(const char* format, Args... args) {
52   char buf[1024];
53 #pragma GCC diagnostic push
54 #pragma GCC diagnostic warning "-Wformat-security"
55   int size = snprintf(buf, sizeof buf, format, value_for_snprintf(args)...);
56 #pragma GCC diagnostic pop
57   if (size <= 0) {
58     abort();
59   }
60   return std::string(buf);
61 }
62 
63 // An entry in the trace.
64 struct ThreadTraceEntry final {
65   std::string text;
66   int indent = 0;
67   const char* source_file = nullptr;
68   int source_line = 0;
69 };
70 
71 // Trace for one thread.
72 class ThreadTrace final {
73  public:
~ThreadTrace()74   ~ThreadTrace() {}
75 
set_thread_id(int thread_id)76   void set_thread_id(int thread_id) { thread_id_ = thread_id; }
thread_id()77   int thread_id() const { return thread_id_; }
78 
is_in_run_ahead_packing_loop()79   bool is_in_run_ahead_packing_loop() const {
80     return is_in_run_ahead_packing_loop_;
81   }
set_is_in_run_ahead_packing_loop(bool value)82   void set_is_in_run_ahead_packing_loop(bool value) {
83     is_in_run_ahead_packing_loop_ = value;
84   }
85 
set_current_source_file(const char * source_file)86   void set_current_source_file(const char* source_file) {
87     current_source_file_ = source_file;
88   }
89 
set_current_source_line(int source_line)90   void set_current_source_line(int source_line) {
91     current_source_line_ = source_line;
92   }
93 
entries()94   const std::vector<ThreadTraceEntry>& entries() const { return entries_; }
95 
96   template <typename... Args>
Write(const char * format,Args...args)97   void Write(const char* format, Args... args) {
98     ThreadTraceEntry entry;
99     entry.text = formatted(format, args...);
100     entry.indent = indent_;
101     entry.source_file = current_source_file_;
102     entry.source_line = current_source_line_;
103     entries_.emplace_back(std::move(entry));
104   }
105 
106   template <typename... Args>
EnterScope(const char * scope_name)107   void EnterScope(const char* scope_name) {
108     Write("%s {", scope_name);
109     indent_++;
110   }
LeaveScope(const char * scope_name)111   void LeaveScope(const char* scope_name) {
112     indent_--;
113     Write("}  // end of %s", scope_name);
114   }
115 
116  private:
117   // The trace contents
118   std::vector<ThreadTraceEntry> entries_;
119 
120   // Current indentation level.
121   int indent_ = 0;
122   // Thread's ID as set by Ruy, e.g. [0,N-1]. Not OS TID.
123   int thread_id_ = -1;
124   // The run-ahead loop in `EnsurePacked` may run many iterations when the
125   // thread is waiting for a block to be packed by another thread --- it's
126   // a busy wait. We track whether we are in that mode to avoid generating
127   // many uninteresting trace entries.
128   bool is_in_run_ahead_packing_loop_ = false;
129   // Last recorded value of __FILE__ and __LINE__, as a convenience so we don't
130   // have to pass these in every call to `Write`.
131   const char* current_source_file_ = nullptr;
132   int current_source_line_ = 0;
133 };
134 
135 // Main components of ruy. Used for trace colorization.
136 enum class Component { kNone, kFrontEnd, kMiddleEnd, kBackEnd, kThreadPool };
137 
138 // Output format for the trace.
139 enum class TraceOutputFormat { kNone, kTerminal, kHtml };
140 
IndentString(int indent)141 inline std::string IndentString(int indent) {
142   std::string s;
143   for (int i = 0; i < indent; i++) {
144     s += "  ";
145   }
146   return s;
147 }
148 
149 // Returns the text to write to the trace to open a colored section.
ColorSectionStart(TraceOutputFormat output_format,Component component)150 inline const char* ColorSectionStart(TraceOutputFormat output_format,
151                                      Component component) {
152   switch (output_format) {
153     case TraceOutputFormat::kTerminal:
154       switch (component) {
155         case Component::kFrontEnd:
156           return "\x1b[36m";
157         case Component::kMiddleEnd:
158           return "\x1b[32m";
159         case Component::kBackEnd:
160           return "\x1b[31m";
161         case Component::kThreadPool:
162           return "\x1b[33m";
163         default:
164           abort();
165           return nullptr;
166       }
167     case TraceOutputFormat::kHtml:
168       switch (component) {
169         case Component::kFrontEnd:
170           return "<span style=\"background-color:#B2EBF2\">";
171         case Component::kMiddleEnd:
172           return "<span style=\"background-color:#C8E6C9\">";
173         case Component::kBackEnd:
174           return "<span style=\"background-color:#FFCDD2\">";
175         case Component::kThreadPool:
176           return "<span style=\"background-color:#FFF9C4\">";
177         default:
178           abort();
179           return nullptr;
180       }
181     default:
182       abort();
183       return nullptr;
184   }
185 }
186 
187 // Returns the text to write to the trace to close a colored section.
ColorSectionEnd(TraceOutputFormat output_format)188 inline const char* ColorSectionEnd(TraceOutputFormat output_format) {
189   switch (output_format) {
190     case TraceOutputFormat::kTerminal:
191       return "\x1b[0m";
192     case TraceOutputFormat::kHtml:
193       return "</span>";
194     default:
195       abort();
196       return nullptr;
197   }
198 }
199 
200 // Returns the output format to use for the trace.
GetOutputFormat()201 inline TraceOutputFormat GetOutputFormat() {
202   const char* html_env = getenv("RUY_TRACE_HTML");
203   if (html_env && strtol(html_env, nullptr, 10) != 0) {
204     return TraceOutputFormat::kHtml;
205   } else {
206     return TraceOutputFormat::kTerminal;
207   }
208 }
209 
210 // A `basename` function that's good enough for ruy __FILE__'s.
211 // Note: `basename` is POSIX-only and annoying (takes a char*, may mutate).
GetBaseName(const char * path)212 inline const char* GetBaseName(const char* path) {
213   std::size_t len = strlen(path);
214   if (len == 0) {
215     return path;
216   }
217   const char* ptr = path + len - 1;
218   while (ptr != path) {
219     if (*ptr == '/' || *ptr == '\\') {
220       return ptr + 1;
221     }
222     --ptr;
223   }
224   // Path did not contain any path separator.
225   return path;
226 }
227 
228 // Determines a Component (used for colorization) by source file.
GetComponentBySourceFile(const char * base_name)229 inline Component GetComponentBySourceFile(const char* base_name) {
230   if (!strcmp(base_name, "pack.h") || !strcmp(base_name, "kernel.h")) {
231     return Component::kBackEnd;
232   } else if (!strcmp(base_name, "trmul.cc") ||
233              !strcmp(base_name, "block_map.cc")) {
234     return Component::kMiddleEnd;
235   } else if (!strcmp(base_name, "thread_pool.cc")) {
236     return Component::kThreadPool;
237   } else {
238     return Component::kFrontEnd;
239   }
240 }
241 
EscapeText(TraceOutputFormat output_format,const std::string & text)242 inline std::string EscapeText(TraceOutputFormat output_format,
243                               const std::string& text) {
244   if (output_format == TraceOutputFormat::kHtml) {
245     std::string escaped_text;
246     for (char c : text) {
247       if (c == '<') {
248         escaped_text += "&lt;";
249       } else if (c == '>') {
250         escaped_text += "&gt;";
251       } else {
252         escaped_text += c;
253       }
254     }
255     return escaped_text;
256   } else {
257     return text;
258   }
259 }
260 
261 // Prints an entry from the trace to the destination trace file.
Print(const ThreadTraceEntry & entry,TraceOutputFormat output_format,FILE * file)262 inline void Print(const ThreadTraceEntry& entry,
263                   TraceOutputFormat output_format, FILE* file) {
264   const char* base_name = GetBaseName(entry.source_file);
265   Component component = GetComponentBySourceFile(base_name);
266   const std::string& source_location =
267       formatted("%s:%d", base_name, entry.source_line);
268   const std::string& escaped_text = EscapeText(output_format, entry.text);
269   fprintf(file, "%s%-32s%s%s%s\n", ColorSectionStart(output_format, component),
270           source_location.c_str(), IndentString(entry.indent).c_str(),
271           escaped_text.c_str(), ColorSectionEnd(output_format));
272 }
273 
274 // Prints a thread's entire trace to the destination trace file.
Print(const ThreadTrace & trace,TraceOutputFormat output_format,FILE * file)275 inline void Print(const ThreadTrace& trace, TraceOutputFormat output_format,
276                   FILE* file) {
277   if (output_format == TraceOutputFormat::kHtml) {
278     fprintf(file, "<html><body><pre>\n<span style=\"font-weight:bold\">\n");
279   }
280   fprintf(file, "Ruy trace for thread %d:\n", trace.thread_id());
281   if (output_format == TraceOutputFormat::kHtml) {
282     fprintf(file, "</span>\n");
283   }
284   for (const ThreadTraceEntry& entry : trace.entries()) {
285     Print(entry, output_format, file);
286   }
287   fprintf(file, "\n");
288   if (output_format == TraceOutputFormat::kHtml) {
289     fprintf(file, "</pre></body></html>\n");
290   }
291 }
292 
293 // Holds all the threads' traces. This is a global singleton class.
294 // On exit, when the singleton is destroyed, the destructor prints out the
295 // traces.
296 class AllThreadTraces final {
297  public:
298   // Add a new ThreadTrace for the current thread. Should be called only once
299   // on each thread.
AddCurrentThread()300   ThreadTrace* AddCurrentThread() {
301     std::lock_guard<std::mutex> lock(mutex_);
302     ThreadTrace* thread_trace = new ThreadTrace;
303     thread_traces_.emplace_back(thread_trace);
304     return thread_trace;
305   }
~AllThreadTraces()306   ~AllThreadTraces() {
307     std::lock_guard<std::mutex> lock(mutex_);
308     // Open the destination file.
309     const char* file_env = getenv("RUY_TRACE_FILE");
310     FILE* file = stdout;
311     if (file_env) {
312       file = fopen(file_env, "w");
313       if (!file) {
314         fprintf(stderr, "Failed to open %s for write\n", file_env);
315         exit(1);
316       }
317     }
318     // Sort the threads by Ruy Thread ID (not OS TID).
319     auto output_format = GetOutputFormat();
320     std::sort(std::begin(thread_traces_), std::end(thread_traces_),
321               [](const auto& a, const auto& b) {
322                 return a->thread_id() < b->thread_id();
323               });
324     // Print all the threads' traces.
325     for (const auto& trace : thread_traces_) {
326       Print(*trace, output_format, file);
327     }
328     if (file_env) {
329       fclose(file);
330     }
331   }
Singleton()332   static AllThreadTraces* Singleton() {
333     static AllThreadTraces all_thread_traces;
334     return &all_thread_traces;
335   }
336 
337  private:
338   std::vector<std::unique_ptr<ThreadTrace>> thread_traces_;
339   std::mutex mutex_;
340 };
341 
342 // Returns the thread-local ThreadTrace singleton, constructing it as needed.
ThreadLocalTrace()343 inline ThreadTrace* ThreadLocalTrace() {
344   static thread_local ThreadTrace* thread_local_trace =
345       AllThreadTraces::Singleton()->AddCurrentThread();
346   return thread_local_trace;
347 }
348 
349 // RAII helper to trace a scope, e.g. a function scope.
350 class RuyTraceScope {
351   const char* source_file_;
352   int source_line_;
353   const char* scope_name_;
354 
355  public:
RuyTraceScope(const char * source_file,int source_line,const char * scope_name)356   RuyTraceScope(const char* source_file, int source_line,
357                 const char* scope_name)
358       : source_file_(source_file),
359         source_line_(source_line),
360         scope_name_(scope_name) {
361     ThreadLocalTrace()->set_current_source_file(source_file_);
362     ThreadLocalTrace()->set_current_source_line(source_line_);
363     ThreadLocalTrace()->EnterScope(scope_name_);
364   }
~RuyTraceScope()365   ~RuyTraceScope() {
366     ThreadLocalTrace()->set_current_source_file(source_file_);
367     ThreadLocalTrace()->set_current_source_line(source_line_);
368     ThreadLocalTrace()->LeaveScope(scope_name_);
369   }
370 };
371 
372 #define RUY_TRACE_SCOPE_NAME_IMPL(file, line, name) \
373   RuyTraceScope ruy_trace_scope##line(file, line, name)
374 #define RUY_TRACE_SCOPE_NAME(name) \
375   RUY_TRACE_SCOPE_NAME_IMPL(__FILE__, __LINE__, name)
376 #define RUY_TRACE_SCOPE \
377   RUY_TRACE_SCOPE_NAME_IMPL(__FILE__, __LINE__, __FUNCTION__)
378 
379 // Helpers to trace Ruy objects.
380 
str(Order o)381 inline std::string str(Order o) {
382   return o == Order::kRowMajor ? "row-major" : "column-major";
383 }
384 
str(Side s)385 inline std::string str(Side s) { return s == Side::kLhs ? "LHS" : "RHS"; }
386 
str(const Layout & layout)387 inline std::string str(const Layout& layout) {
388   std::string s =
389       formatted("%dx%d, %s", layout.rows(), layout.cols(), str(layout.order()));
390   int inner_size =
391       layout.order() == Order::kRowMajor ? layout.cols() : layout.rows();
392   if (inner_size != layout.stride()) {
393     s += formatted(", stride=%d", layout.stride());
394   } else {
395     s += formatted(", unstrided");
396   }
397   return s;
398 }
399 
str(const MatLayout & layout)400 inline std::string str(const MatLayout& layout) {
401   std::string s =
402       formatted("%dx%d, %s", layout.rows, layout.cols, str(layout.order));
403   int inner_size = layout.order == Order::kRowMajor ? layout.cols : layout.rows;
404   if (inner_size != layout.stride) {
405     s += formatted(", stride=%d", layout.stride);
406   } else {
407     s += formatted(", unstrided");
408   }
409   return s;
410 }
411 
str(const PMatLayout & layout)412 inline std::string str(const PMatLayout& layout) {
413   std::string s =
414       formatted("%dx%d, %s", layout.rows, layout.cols, str(layout.order));
415   int inner_size = layout.order == Order::kRowMajor ? layout.cols : layout.rows;
416   if (inner_size != layout.stride) {
417     s += formatted(", stride=%d", layout.stride);
418   } else {
419     s += formatted(", unstrided");
420   }
421   s += formatted(", kernel blocks: %dx%d %s", layout.kernel.rows,
422                  layout.kernel.cols, str(layout.kernel.order));
423   return s;
424 }
425 
426 template <typename T>
str()427 std::string str() {
428   return "<unknown type>";
429 }
430 #define RUY_IMPL_STR_TYPE_STD(T)     \
431   template <>                        \
432   inline std::string str<std::T>() { \
433     return #T;                       \
434   }
435 #define RUY_IMPL_STR_TYPE(T)    \
436   template <>                   \
437   inline std::string str<T>() { \
438     return #T;                  \
439   }
440 
441 RUY_IMPL_STR_TYPE(float)
RUY_IMPL_STR_TYPE(double)442 RUY_IMPL_STR_TYPE(double)
443 RUY_IMPL_STR_TYPE_STD(int8_t)
444 RUY_IMPL_STR_TYPE_STD(uint8_t)
445 RUY_IMPL_STR_TYPE_STD(int16_t)
446 RUY_IMPL_STR_TYPE_STD(uint16_t)
447 RUY_IMPL_STR_TYPE_STD(int32_t)
448 RUY_IMPL_STR_TYPE_STD(uint32_t)
449 RUY_IMPL_STR_TYPE_STD(int64_t)
450 RUY_IMPL_STR_TYPE_STD(uint64_t)
451 
452 template <typename T>
453 std::string str(const Matrix<T>& matrix) {
454   std::string s = formatted("Matrix<%s>, %s", str<T>(), str(matrix.layout()));
455   if (matrix.zero_point()) {
456     s += formatted(", zero_point=%d", static_cast<int>(matrix.zero_point()));
457   }
458   if (matrix.cache_policy() != CachePolicy::kNeverCache) {
459     s +=
460         formatted(", cache_policy=%d", static_cast<int>(matrix.cache_policy()));
461   }
462   return s;
463 }
464 
str(const Type & type)465 inline std::string str(const Type& type) {
466   char c;
467   if (type.is_floating_point) {
468     c = 'f';
469   } else if (type.is_signed) {
470     c = 'i';
471   } else {
472     c = 'u';
473   }
474   return formatted("%c%d", c, type.size * 8);
475 }
476 
str(const EMat & mat)477 inline std::string str(const EMat& mat) {
478   std::string s =
479       formatted("EMat, data_type=%s, %s", str(mat.data_type), str(mat.layout));
480   if (mat.zero_point) {
481     s += formatted(", zero_point=%d", static_cast<int>(mat.zero_point));
482   }
483   if (mat.cache_policy != CachePolicy::kNeverCache) {
484     s += formatted(", cache_policy=%d", static_cast<int>(mat.cache_policy));
485   }
486   return s;
487 }
488 
str(const PEMat & mat)489 inline std::string str(const PEMat& mat) {
490   std::string s =
491       formatted("PEMat, data_type=%s, %s", str(mat.data_type), str(mat.layout));
492   if (mat.zero_point) {
493     s += formatted(", zero_point=%d", static_cast<int>(mat.zero_point));
494   }
495   return s;
496 }
497 
str(Path paths)498 inline std::string str(Path paths) {
499   bool first = true;
500   std::string s;
501   for (int bit = 0; bit < 16; bit++) {
502     Path cur_path = static_cast<Path>(1 << bit);
503     if ((paths & cur_path) != Path::kNone) {
504       if (!first) {
505         s += " | ";
506       }
507       first = false;
508       switch (cur_path) {
509         case Path::kNone:
510           continue;
511 #define RUY_HANDLE_PATH(p) \
512   case Path::p:            \
513     s += #p;               \
514     break;
515           RUY_HANDLE_PATH(kStandardCpp)
516           RUY_HANDLE_PATH(kInternalStandardCppVariant1)
517           RUY_HANDLE_PATH(kInternalStandardCppVariant2)
518           RUY_HANDLE_PATH(kInternalStandardCppVariant3)
519 #if RUY_PLATFORM_ARM
520           RUY_HANDLE_PATH(kNeon)
521           RUY_HANDLE_PATH(kNeonDotprod)
522 #endif  // RUY_PLATFORM_ARM
523 #if RUY_PLATFORM_X86
524           RUY_HANDLE_PATH(kAvx)
525           RUY_HANDLE_PATH(kAvx2Fma)
526           RUY_HANDLE_PATH(kAvx512)
527 #endif  // RUY_PLATFORM_X86
528 #undef RUY_HANDLE_PATH
529         default:
530           fprintf(stderr, "Unhandled Path value 0x%x\n",
531                   static_cast<int>(cur_path));
532           abort();
533       }
534     }
535   }
536   return s;
537 }
538 
539 // Implementation of RUY_TRACE_INFO(X) macros.
540 
541 #define RUY_TRACE_INFO_MUL                                            \
542   ThreadLocalTrace()->Write("CompiledPaths: %s", str(CompiledPaths)); \
543   ThreadLocalTrace()->Write("LHS: %s", str(lhs));                     \
544   ThreadLocalTrace()->Write("RHS: %s", str(rhs));                     \
545   ThreadLocalTrace()->Write("Destination: %s", str(*dst));
546 
547 #define RUY_TRACE_INFO_CREATE_TRMUL_PARAMS_TRANSPOSING                      \
548   ThreadLocalTrace()->Write("Canonicalizing to column-major destination:"); \
549   ThreadLocalTrace()->Write(                                                \
550       "Swapping LHS<->RHS and flipping all storage orders.");
551 
552 #define RUY_TRACE_INFO_CREATE_TRMUL_PARAMS_ASSUMING_COLMAJOR_DST         \
553   ThreadLocalTrace()->Write("Runtime-selected path: %s", str(the_path)); \
554   ThreadLocalTrace()->Write("LHS: %s", str(params->src[Side::kLhs]));    \
555   ThreadLocalTrace()->Write("RHS: %s", str(params->src[Side::kRhs]));    \
556   ThreadLocalTrace()->Write("Destination: %s", str(params->dst));
557 
558 #define RUY_TRACE_INFO_POPULATE_TRMUL_PARAMS                                \
559   ThreadLocalTrace()->Write(                                                \
560       "Here we have this Path as a template parameter: %s", str(ThePath));  \
561   ThreadLocalTrace()->Write("PackedLhsScalar: %s", str<PackedLhsScalar>()); \
562   ThreadLocalTrace()->Write("PackedRhsScalar: %s", str<PackedRhsScalar>()); \
563   ThreadLocalTrace()->Write("Kernel function pointer: %p",                  \
564                             params->run_kernel);                            \
565   ThreadLocalTrace()->Write("Kernel LHS block layout: %dx%d %s",            \
566                             LhsKernelLayout::kRows, LhsKernelLayout::kCols, \
567                             str(LhsKernelLayout::kOrder));                  \
568   ThreadLocalTrace()->Write("Kernel RHS block layout: %dx%d %s",            \
569                             RhsKernelLayout::kRows, RhsKernelLayout::kCols, \
570                             str(RhsKernelLayout::kOrder));                  \
571   ThreadLocalTrace()->Write("Created packed matrices:");                    \
572   ThreadLocalTrace()->Write("Packed LHS matrix: %s",                        \
573                             str(params->packed_matrix[Side::kLhs]));        \
574   ThreadLocalTrace()->Write("Packed RHS matrix: %s",                        \
575                             str(params->packed_matrix[Side::kRhs]));        \
576   ThreadLocalTrace()->Write("LHS packing function pointer: %p",             \
577                             params->run_pack[Side::kLhs]);                  \
578   ThreadLocalTrace()->Write("RHS packing function pointer: %p",             \
579                             params->run_pack[Side::kRhs]);
580 
581 #define RUY_TRACE_INFO_GET_RUNTIME_ENABLED_PATHS_USING_SET_VALUE        \
582   ThreadLocalTrace()->Write("SetRuntimeEnabledPaths forcing paths: %s", \
583                             str(*paths));
584 
585 #define RUY_TRACE_INFO_GET_RUNTIME_ENABLED_PATHS_USING_ENV_VAR        \
586   ThreadLocalTrace()->Write("Environment variable forcing paths: %s", \
587                             str(*paths));
588 
589 #define RUY_TRACE_INFO_GET_RUNTIME_ENABLED_PATHS_USING_DETECTION \
590   ThreadLocalTrace()->Write(                                     \
591       "Runtime-detected paths: %s",                              \
592       str(*paths & ~kNonArchPathsIncludingInternalVariants));
593 
594 #define RUY_TRACE_INFO_PREPARE_PACKED_MATRICES_SHOULD_CACHE            \
595   ThreadLocalTrace()->Write(                                           \
596       "Caching the packed %s matrix. Already in cache: %s", str(side), \
597       action == PrepackedCache::Action::kInsertedNewEntry ? "no" : "yes");
598 
599 #define RUY_TRACE_INFO_PREPARE_PACKED_MATRICES_NO_CACHE \
600   ThreadLocalTrace()->Write("Not caching the packed %s matrix.", str(side));
601 
602 #define RUY_TRACE_INFO_GET_TENTATIVE_THREAD_COUNT                          \
603   ThreadLocalTrace()->Write(                                               \
604       "tentative_thread_count=%d (determined based on shape %dx%dx%d and " \
605       "max_num_threads=%d)",                                               \
606       tentative_thread_count, rows, depth, cols, ctx->max_num_threads());
607 
608 #define RUY_TRACE_INFO_GET_USE_SIMPLE_LOOP_RETURNS_TRUE                    \
609   ThreadLocalTrace()->Write(                                               \
610       "Choosing to use the simple loop code path in TrMul because of the " \
611       "linear traversal and single thread.");
612 
613 #define RUY_TRACE_INFO_GET_USE_SIMPLE_LOOP_RETURNS_FALSE                    \
614   ThreadLocalTrace()->Write(                                                \
615       "Choosing to use the general case code path in TrMul because of: %s", \
616       tentative_thread_count > 1 ? "multi-threading"                        \
617                                  : "non-linear traversal order");
618 
619 #define RUY_TRACE_INFO_TRMUL_SIMPLE_LOOP \
620   ThreadLocalTrace()->Write("Entering the simple loop code path of TrMul");
621 
622 #define RUY_TRACE_INFO_TRMUL_GENERAL_CASE \
623   ThreadLocalTrace()->Write("Entering the general case code path of TrMul");
624 
625 #define RUY_TRACE_INFO_MAKE_BLOCK_MAP_START                                    \
626   ThreadLocalTrace()->Write("Kernel block: %dx%d", kernel_rows, kernel_cols);  \
627   ThreadLocalTrace()->Write(                                                   \
628       "BlockMap shape: %dx%d (destination matrix shape rounded to next "       \
629       "kernel blocks)",                                                        \
630       rows, cols);                                                             \
631   ThreadLocalTrace()->Write(                                                   \
632       "Rectangularness log2: %dx%d (powers of two factors bringing the shape " \
633       "closest to square)",                                                    \
634       rows_rectangularness_log2, cols_rectangularness_log2);                   \
635   ThreadLocalTrace()->Write("Accumulation depth: %d", depth);                  \
636   ThreadLocalTrace()->Write("LHS scalar type size: %d", lhs_scalar_size);      \
637   ThreadLocalTrace()->Write("RHS scalar type size: %d", rhs_scalar_size);      \
638   ThreadLocalTrace()->Write("Tentative thread count: %d",                      \
639                             tentative_thread_count);                           \
640   ThreadLocalTrace()->Write(                                                   \
641       "CPU cache params: local_cache_size=%d, last_level_cache_size=%d",       \
642       cpu_cache_params.local_cache_size,                                       \
643       cpu_cache_params.last_level_cache_size);                                 \
644   ThreadLocalTrace()->Write(                                                   \
645       "For the sizes below, when rows!=cols, we always retain the min of the " \
646       "two.");                                                                 \
647   ThreadLocalTrace()->Write("Kernel block size_log2: %d", kernel_size_log2);   \
648   ThreadLocalTrace()->Write(                                                   \
649       "BlockMap size_log2: %d (destination matrix shape rounded to next "      \
650       "kernel blocks)",                                                        \
651       size_log2);                                                              \
652   ThreadLocalTrace()->Write(                                                   \
653       "Now we will pick the optimal log2 of BlockMap block size");
654 
655 #define RUY_TRACE_INFO_MAKE_BLOCK_MAP_EACH_TENTATIVE_BLOCK_SIZE           \
656   ThreadLocalTrace()->Write(                                              \
657       "For BlockMap block size_log2 %d, score=%d ("                       \
658       "multithreading_score=%d + cache_locality_score=%d + "              \
659       "kernel_amortization_score=%d)",                                    \
660       block_size_log2, score, multithreading_score, cache_locality_score, \
661       kernel_amortization_score);
662 
663 #define RUY_TRACE_INFO_MAKE_BLOCK_MAP_END                                      \
664   ThreadLocalTrace()->Write("Selecting BlockMap block size_log2: %d",          \
665                             best_score_block_size_log2);                       \
666   ThreadLocalTrace()->Write(                                                   \
667       "BlockMap has %dx%d blocks, each of size between %dx%d and %dx%d.",      \
668       1 << num_blocks_of_rows_log2, 1 << num_blocks_of_cols_log2,              \
669       block_map->small_block_dims[Side::kLhs],                                 \
670       block_map->small_block_dims[Side::kRhs],                                 \
671       block_map->small_block_dims[Side::kLhs] +                                \
672           block_map->kernel_dims[Side::kLhs],                                  \
673       block_map->small_block_dims[Side::kRhs] +                                \
674           block_map->kernel_dims[Side::kRhs]);                                 \
675   ThreadLocalTrace()->Write(                                                   \
676       "The first %d rows of blocks have %d rows, the remaining ones have %d "  \
677       "rows ",                                                                 \
678       block_map->large_blocks[Side::kLhs],                                     \
679       block_map->small_block_dims[Side::kLhs] +                                \
680           block_map->kernel_dims[Side::kLhs],                                  \
681       block_map->small_block_dims[Side::kLhs]);                                \
682   ThreadLocalTrace()->Write(                                                   \
683       "The first %d columns of blocks have %d columns, the remaining ones "    \
684       "have %d columns ",                                                      \
685       block_map->large_blocks[Side::kRhs],                                     \
686       block_map->small_block_dims[Side::kRhs] +                                \
687           block_map->kernel_dims[Side::kLhs],                                  \
688       block_map->small_block_dims[Side::kRhs]);                                \
689   ThreadLocalTrace()->Write(                                                   \
690       "Traversal order: %s",                                                   \
691       block_map->traversal_order == BlockMapTraversalOrder::kLinear ? "linear" \
692       : block_map->traversal_order == BlockMapTraversalOrder::kFractalZ        \
693           ? "fractal Z-curve"                                                  \
694       : block_map->traversal_order == BlockMapTraversalOrder::kFractalU        \
695           ? "fractal U-curve"                                                  \
696       : block_map->traversal_order == BlockMapTraversalOrder::kFractalHilbert  \
697           ? "fractal Hilbert curve"                                            \
698           : nullptr);                                                          \
699   ThreadLocalTrace()->Write("Finalized thread count: %d",                      \
700                             block_map->thread_count);
701 
702 #define RUY_TRACE_SET_THEAD_ID(thread_id) \
703   ThreadLocalTrace()->set_thread_id(thread_id);
704 
705 #define RUY_TRACE_INFO_TRMUL_TASK_MAIN_LOOP_GOT_BLOCK_COORDS              \
706   ThreadLocalTrace()->Write(                                              \
707       "Block #%d is at position (%d, %d) in the BlockMap.", block_id,     \
708       block[Side::kLhs], block[Side::kRhs]);                              \
709   ThreadLocalTrace()->Write(                                              \
710       "Block #%d has shape %dx%d and starts at position (%d, %d) in the " \
711       "destination matrix.",                                              \
712       block_id, end[Side::kLhs] - start[Side::kLhs],                      \
713       end[Side::kRhs] - start[Side::kRhs], start[Side::kLhs],             \
714       start[Side::kRhs]);                                                 \
715   ThreadLocalTrace()->Write(                                              \
716       "Block #%d depends on LHS panel #%d and RHS panel #%d.", block_id,  \
717       block[Side::kLhs], block[Side::kRhs]);
718 
719 #define RUY_TRACE_INFO_TRYPACK_PACKING                                  \
720   ThreadLocalTrace()->Write(                                            \
721       "%s panel #%d is not already packed. Packing it now.", str(side), \
722       block);
723 
724 #define RUY_TRACE_INFO_TRYPACK_ANOTHER_THREAD_PACKING                \
725   if (!ThreadLocalTrace()->is_in_run_ahead_packing_loop()) {         \
726     ThreadLocalTrace()->Write(                                       \
727         "%s panel #%d is currently being packed by another thread.", \
728         str(side), block);                                           \
729   }
730 
731 #define RUY_TRACE_INFO_TRYPACK_PREVIOUSLY_PACKED                          \
732   if (!ThreadLocalTrace()->is_in_run_ahead_packing_loop()) {              \
733     ThreadLocalTrace()->Write("%s panel #%d had previously been packed.", \
734                               str(side), block);                          \
735   }
736 
737 #define RUY_TRACE_INFO_TRYPACK_PACKED_BY_ANOTHER_THREAD                  \
738   ThreadLocalTrace()->Write(                                             \
739       "%s panel #%d has just been packed by another thread.", str(side), \
740       block);
741 
742 #define RUY_TRACE_INFO_ENSURE_PACKED_ENTER_RUN_AHEAD                       \
743   if (!ThreadLocalTrace()->is_in_run_ahead_packing_loop()) {               \
744     ThreadLocalTrace()->set_is_in_run_ahead_packing_loop(true);            \
745     ThreadLocalTrace()->Write(                                             \
746         "We're blocked on other threads packing the panels that we need. " \
747         "Packing some other panels while we wait...");                     \
748   }
749 
750 #define RUY_TRACE_INFO_ENSURE_PACKED_END                                  \
751   if (ThreadLocalTrace()->is_in_run_ahead_packing_loop()) {               \
752     ThreadLocalTrace()->set_is_in_run_ahead_packing_loop(false);          \
753     ThreadLocalTrace()->Write(                                            \
754         "Other threads have finished packing what we were waiting for."); \
755   }
756 
757 #define RUY_TRACE_INFO_RUN_PACK                                               \
758   ThreadLocalTrace()->Write("Path: %s", str(ThePath));                        \
759   ThreadLocalTrace()->Write("Packing panel consisting of columns [%d, %d)",   \
760                             start_col, end_col);                              \
761   ThreadLocalTrace()->Write("Source: columns [%d, %d) of %s", start_col,      \
762                             end_col, str(src_matrix));                        \
763   ThreadLocalTrace()->Write("Destination: columns [%d, %d) of %s", start_col, \
764                             end_col, str(*packed_matrix));                    \
765   if (end_col > src_matrix.layout.cols) {                                     \
766     ThreadLocalTrace()->Write(                                                \
767         "This runs past the last column of the source matrix. Padding as "    \
768         "needed.");                                                           \
769   }                                                                           \
770   if (packed_matrix->layout.rows > src_matrix.layout.rows) {                  \
771     ThreadLocalTrace()->Write(                                                \
772         "The packed matrix has more rows than the source matrix due to "      \
773         "rounding up to the kernel block size. Padding as needed.");          \
774   }
775 
776 #define RUY_TRACE_INFO_RUN_KERNEL                                              \
777   {                                                                            \
778     ThreadLocalTrace()->Write("Path: %s", str(KernelArgs<KernelType>::kPath)); \
779     int lhs_cols = end[Side::kLhs] - start[Side::kLhs];                        \
780     int rhs_cols = end[Side::kRhs] - start[Side::kRhs];                        \
781     int kernel_lhs_cols = src[Side::kLhs].layout.kernel.cols;                  \
782     int kernel_rhs_cols = src[Side::kRhs].layout.kernel.cols;                  \
783     ThreadLocalTrace()->Write("LHS: columns [%d, %d) of %s",                   \
784                               start[Side::kLhs], end[Side::kLhs],              \
785                               str(src[Side::kLhs]));                           \
786     ThreadLocalTrace()->Write("RHS: columns [%d, %d) of %s",                   \
787                               start[Side::kRhs], end[Side::kRhs],              \
788                               str(src[Side::kRhs]));                           \
789     ThreadLocalTrace()->Write("Destination: block [%d, %d)x[%d, %d) of %s",    \
790                               start[Side::kLhs], end[Side::kLhs],              \
791                               start[Side::kRhs], end[Side::kRhs], str(*dst));  \
792     if (end[Side::kLhs] > dst->layout.rows ||                                  \
793         end[Side::kRhs] > dst->layout.cols) {                                  \
794       ThreadLocalTrace()->Write(                                               \
795           "This runs over the destination matrix boundaries. The kernel will " \
796           "internally clamp stores to avoid overruns.");                       \
797     }                                                                          \
798     ThreadLocalTrace()->Write(                                                 \
799         "The kernel's inner loop only produces a %dx%d block, so the "         \
800         "kernel's outer loops will run %dx%d times.",                          \
801         kernel_lhs_cols, kernel_rhs_cols, lhs_cols / kernel_lhs_cols,          \
802         rhs_cols / kernel_rhs_cols);                                           \
803   }
804 
805 #define RUY_TRACE_INFO_THREAD_FUNC_IMPL_WAITING \
806   ThreadLocalTrace()->Write("Waiting for a task...");
807 
808 #define RUY_TRACE_INFO_THREADPOOL_EXECUTE_STARTING_TASK \
809   ThreadLocalTrace()->Write("Sending task #%d to a worker thread...", i);
810 
811 #define RUY_TRACE_INFO_THREADPOOL_EXECUTE_STARTING_TASK_ZERO_ON_CUR_THREAD \
812   ThreadLocalTrace()->Write("Running task #0 on the current thread...");
813 
814 #define RUY_TRACE_INFO_THREADPOOL_EXECUTE_WAITING_FOR_THREADS \
815   ThreadLocalTrace()->Write("Waiting for worker threads to finish..");
816 
817 #define RUY_TRACE_INFO(id)                                 \
818   [=]() {                                                  \
819     ThreadLocalTrace()->set_current_source_file(__FILE__); \
820     ThreadLocalTrace()->set_current_source_line(__LINE__); \
821     RUY_TRACE_INFO_##id                                    \
822   }()
823 
824 }  // namespace ruy
825 
826 #else
827 
828 // Vacuous implementation when RUY_TRACE is not defined.
829 #define RUY_TRACE_SCOPE_NAME(name)
830 #define RUY_TRACE_SCOPE
831 #define RUY_TRACE_SET_THEAD_ID(thread_id)
832 #define RUY_TRACE_INFO(id)
833 
834 #endif
835 
836 #endif  // RUY_RUY_TRACE_H_
837