/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
25 
/* This file benchmarks buffer clears and copies done with CP DMA, SDMA and
 * compute shaders, and prints the fastest method for each buffer size.
 */
27 
#include "si_pipe.h"
#include "si_query.h"

#include <stdio.h>
#include <stdlib.h>
30 
/* Smallest buffer size (in bytes) that is benchmarked. */
#define MIN_SIZE   512
/* Largest buffer size (in bytes) that is benchmarked. */
#define MAX_SIZE   (128 * 1024 * 1024)
/* The benchmarked size is shifted left by this amount after every step,
 * i.e. a shift of 1 doubles the size each time.
 */
#define SIZE_SHIFT 1
/* Number of clears/copies issued inside one timer query. The elapsed time is
 * divided by this count to get the time of a single operation.
 */
#define NUM_RUNS   128
35 
/* Convert "num_bytes transferred in ns nanoseconds" into a rate in MB/s,
 * where 1 MB is 1024 * 1024 bytes.
 *
 * A duration of 0 ns cannot be turned into a rate. Dividing by it would
 * yield +infinity, which callers cannot store in an integer score, so 0 is
 * returned instead.
 */
static double get_MBps_rate(unsigned num_bytes, unsigned ns)
{
   if (ns == 0)
      return 0.0;

   return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
}
40 
si_test_dma_perf(struct si_screen * sscreen)41 void si_test_dma_perf(struct si_screen *sscreen)
42 {
43    struct pipe_screen *screen = &sscreen->b;
44    struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
45    struct si_context *sctx = (struct si_context *)ctx;
46    const uint32_t clear_value = 0x12345678;
47    static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
48    static const unsigned cs_waves_per_sh_list[] = {0, 4, 8, 16};
49 
50 #define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
51 #define NUM_METHODS (4 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
52 
53    static const char *method_str[] = {
54       "CP MC   ",
55       "CP L2   ",
56       "CP L2   ",
57       "SDMA    ",
58    };
59    static const char *placement_str[] = {
60       /* Clear */
61       "fill->VRAM",
62       "fill->GTT ",
63       /* Copy */
64       "VRAM->VRAM",
65       "VRAM->GTT ",
66       "GTT ->VRAM",
67    };
68 
69    printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
70    printf("Heap       ,Method  ,L2p,Wa,");
71    for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
72       if (size >= 1024)
73          printf("%6uKB,", size / 1024);
74       else
75          printf(" %6uB,", size);
76    }
77    printf("\n");
78 
79    /* results[log2(size)][placement][method][] */
80    struct si_result {
81       bool is_valid;
82       bool is_cp;
83       bool is_sdma;
84       bool is_cs;
85       unsigned cache_policy;
86       unsigned dwords_per_thread;
87       unsigned waves_per_sh;
88       unsigned score;
89       unsigned index; /* index in results[x][y][index] */
90    } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
91 
92    /* Run benchmarks. */
93    for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
94       bool is_copy = placement >= 2;
95 
96       printf("-----------,--------,---,--,");
97       for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
98          printf("--------,");
99       printf("\n");
100 
101       for (unsigned method = 0; method < NUM_METHODS; method++) {
102          bool test_cp = method <= 2;
103          bool test_sdma = method == 3;
104          bool test_cs = method >= 4;
105          unsigned cs_method = method - 4;
106          unsigned cs_waves_per_sh =
107             test_cs ? cs_waves_per_sh_list[cs_method / (3 * NUM_SHADERS)] : 0;
108          cs_method %= 3 * NUM_SHADERS;
109          unsigned cache_policy =
110             test_cp ? method % 3 : test_cs ? (cs_method / NUM_SHADERS) : 0;
111          unsigned cs_dwords_per_thread =
112             test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
113 
114          if (test_sdma && !sctx->sdma_cs)
115             continue;
116 
117          if (sctx->chip_class == GFX6) {
118             /* GFX6 doesn't support CP DMA operations through L2. */
119             if (test_cp && cache_policy != L2_BYPASS)
120                continue;
121             /* WAVES_PER_SH is in multiples of 16 on GFX6. */
122             if (test_cs && cs_waves_per_sh % 16 != 0)
123                continue;
124          }
125 
126          /* SI_RESOURCE_FLAG_UNCACHED setting RADEON_FLAG_UNCACHED doesn't affect
127           * chips before gfx9.
128           */
129          if (test_cs && cache_policy && sctx->chip_class < GFX9)
130             continue;
131 
132          printf("%s ,", placement_str[placement]);
133          if (test_cs) {
134             printf("CS x%-4u,%3s,", cs_dwords_per_thread,
135                    cache_policy == L2_LRU ? "LRU" : cache_policy == L2_STREAM ? "Str" : "");
136          } else {
137             printf("%s,%3s,", method_str[method],
138                    method == L2_LRU ? "LRU" : method == L2_STREAM ? "Str" : "");
139          }
140          if (test_cs && cs_waves_per_sh)
141             printf("%2u,", cs_waves_per_sh);
142          else
143             printf("  ,");
144 
145          void *compute_shader = NULL;
146          if (test_cs) {
147             compute_shader = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
148                                               cache_policy == L2_STREAM, is_copy);
149          }
150 
151          double score = 0;
152          for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
153             /* Don't test bigger sizes if it's too slow. Print 0. */
154             if (size >= 512 * 1024 && score < 400 * (size / (4 * 1024 * 1024))) {
155                printf("%7.0f ,", 0.0);
156                continue;
157             }
158 
159             enum pipe_resource_usage dst_usage, src_usage;
160             struct pipe_resource *dst, *src;
161             unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
162             unsigned flags = cache_policy == L2_BYPASS ? SI_RESOURCE_FLAG_UNCACHED : 0;
163 
164             if (test_sdma) {
165                if (sctx->chip_class == GFX6)
166                   query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
167                else
168                   query_type = SI_QUERY_TIME_ELAPSED_SDMA;
169             }
170 
171             if (placement == 0 || placement == 2 || placement == 4)
172                dst_usage = PIPE_USAGE_DEFAULT;
173             else
174                dst_usage = PIPE_USAGE_STREAM;
175 
176             if (placement == 2 || placement == 3)
177                src_usage = PIPE_USAGE_DEFAULT;
178             else
179                src_usage = PIPE_USAGE_STREAM;
180 
181             dst = pipe_aligned_buffer_create(screen, flags, dst_usage, size, 256);
182             src = is_copy ? pipe_aligned_buffer_create(screen, flags, src_usage, size, 256) : NULL;
183 
184             /* Wait for idle before testing, so that other processes don't mess up the results. */
185             sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
186                            SI_CONTEXT_FLUSH_AND_INV_CB |
187                            SI_CONTEXT_FLUSH_AND_INV_DB;
188             sctx->emit_cache_flush(sctx);
189 
190             struct pipe_query *q = ctx->create_query(ctx, query_type, 0);
191             ctx->begin_query(ctx, q);
192 
193             /* Run tests. */
194             for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
195                if (test_cp) {
196                   /* CP DMA */
197                   if (is_copy) {
198                      si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0, SI_COHERENCY_NONE,
199                                            cache_policy);
200                   } else {
201                      si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size, clear_value, 0,
202                                             SI_COHERENCY_NONE, cache_policy);
203                   }
204                } else if (test_sdma) {
205                   /* SDMA */
206                   if (is_copy) {
207                      si_sdma_copy_buffer(sctx, dst, src, 0, 0, size);
208                   } else {
209                      si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
210                   }
211                } else {
212                   /* Compute */
213                   /* The memory accesses are coalesced, meaning that the 1st instruction writes
214                    * the 1st contiguous block of data for the whole wave, the 2nd instruction
215                    * writes the 2nd contiguous block of data, etc.
216                    */
217                   unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
218                   unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
219                   unsigned dwords_per_wave = cs_dwords_per_thread * 64;
220 
221                   unsigned num_dwords = size / 4;
222                   unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
223 
224                   struct pipe_grid_info info = {};
225                   info.block[0] = MIN2(64, num_instructions);
226                   info.block[1] = 1;
227                   info.block[2] = 1;
228                   info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
229                   info.grid[1] = 1;
230                   info.grid[2] = 1;
231 
232                   struct pipe_shader_buffer sb[2] = {};
233                   sb[0].buffer = dst;
234                   sb[0].buffer_size = size;
235 
236                   if (is_copy) {
237                      sb[1].buffer = src;
238                      sb[1].buffer_size = size;
239                   } else {
240                      for (unsigned i = 0; i < 4; i++)
241                         sctx->cs_user_data[i] = clear_value;
242                   }
243 
244                   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1);
245                   ctx->bind_compute_state(ctx, compute_shader);
246                   sctx->cs_max_waves_per_sh = cs_waves_per_sh;
247 
248                   ctx->launch_grid(ctx, &info);
249 
250                   ctx->bind_compute_state(ctx, NULL);
251                   sctx->cs_max_waves_per_sh = 0; /* disable the limit */
252                }
253 
254                /* Flush L2, so that we don't just test L2 cache performance except for L2_LRU. */
255                if (!test_sdma) {
256                   sctx->flags |= SI_CONTEXT_INV_VCACHE |
257                                  (cache_policy == L2_LRU ? 0 : SI_CONTEXT_INV_L2) |
258                                  SI_CONTEXT_CS_PARTIAL_FLUSH;
259                   sctx->emit_cache_flush(sctx);
260                }
261             }
262 
263             ctx->end_query(ctx, q);
264             ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
265 
266             pipe_resource_reference(&dst, NULL);
267             pipe_resource_reference(&src, NULL);
268 
269             /* Get results. */
270 
271             union pipe_query_result result;
272 
273             ctx->get_query_result(ctx, q, true, &result);
274             ctx->destroy_query(ctx, q);
275 
276             score = get_MBps_rate(size, result.u64 / (double)NUM_RUNS);
277             printf("%7.0f ,", score);
278             fflush(stdout);
279 
280             struct si_result *r = &results[util_logbase2(size)][placement][method];
281             r->is_valid = true;
282             r->is_cp = test_cp;
283             r->is_sdma = test_sdma;
284             r->is_cs = test_cs;
285             r->cache_policy = cache_policy;
286             r->dwords_per_thread = cs_dwords_per_thread;
287             r->waves_per_sh = cs_waves_per_sh;
288             r->score = score;
289             r->index = method;
290          }
291          puts("");
292 
293          if (compute_shader)
294             ctx->delete_compute_state(ctx, compute_shader);
295       }
296    }
297 
298    puts("");
299    puts("static struct si_method");
300    printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool "
301           "cached)\n",
302           sctx->screen->info.name);
303    puts("{");
304    puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
305 
306    /* Analyze results and find the best methods. */
307    for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
308       if (placement == 0)
309          puts("   if (dst == RADEON_DOMAIN_VRAM) {");
310       else if (placement == 1)
311          puts("   } else { /* GTT */");
312       else if (placement == 2) {
313          puts("}");
314          puts("");
315          puts("static struct si_method");
316          printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
317                 sctx->screen->info.name);
318          printf("                     uint64_t size64, bool async, bool cached)\n");
319          puts("{");
320          puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
321          puts("   if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
322       } else if (placement == 3)
323          puts("   } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
324       else
325          puts("   } else { /* GTT -> VRAM */");
326 
327       for (unsigned mode = 0; mode < 3; mode++) {
328          bool async = mode == 0;
329          bool cached = mode == 1;
330 
331          if (async)
332             puts("      if (async) { /* SDMA or async compute */");
333          else if (cached)
334             puts("      if (cached) { /* gfx ring */");
335          else
336             puts("      } else { /* gfx ring - uncached */");
337 
338          /* The list of best chosen methods. */
339          struct si_result *methods[32];
340          unsigned method_max_size[32];
341          unsigned num_methods = 0;
342 
343          for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
344             /* Find the best method. */
345             struct si_result *best = NULL;
346 
347             for (unsigned i = 0; i < NUM_METHODS; i++) {
348                struct si_result *r = &results[util_logbase2(size)][placement][i];
349 
350                if (!r->is_valid)
351                   continue;
352 
353                /* Ban CP DMA clears via MC on <= GFX8. They are super slow
354                 * on GTT, which we can get due to BO evictions.
355                 */
356                if (sctx->chip_class <= GFX8 && placement == 1 && r->is_cp &&
357                    r->cache_policy == L2_BYPASS)
358                   continue;
359 
360                if (async) {
361                   /* The following constraints for compute IBs try to limit
362                    * resource usage so as not to decrease the performance
363                    * of gfx IBs too much.
364                    */
365 
366                   /* Don't use CP DMA on asynchronous rings, because
367                    * the engine is shared with gfx IBs.
368                    */
369                   if (r->is_cp)
370                      continue;
371 
372                   /* Don't use L2 caching on asynchronous rings to minimize
373                    * L2 usage.
374                    */
375                   if (r->cache_policy == L2_LRU)
376                      continue;
377 
378                   /* Asynchronous compute recommends waves_per_sh != 0
379                    * to limit CU usage. */
380                   if (r->is_cs && r->waves_per_sh == 0)
381                      continue;
382                } else {
383                   /* SDMA is always asynchronous */
384                   if (r->is_sdma)
385                      continue;
386 
387                   if (cached && r->cache_policy == L2_BYPASS)
388                      continue;
389                   if (!cached && r->cache_policy == L2_LRU)
390                      continue;
391                }
392 
393                if (!best) {
394                   best = r;
395                   continue;
396                }
397 
398                /* Assume some measurement error. Earlier methods occupy fewer
399                 * resources, so the next method is always more greedy, and we
400                 * don't want to select it due to a measurement error.
401                 */
402                double min_improvement = 1.03;
403 
404                if (best->score * min_improvement < r->score)
405                   best = r;
406             }
407 
408             if (num_methods > 0) {
409                unsigned prev_index = num_methods - 1;
410                struct si_result *prev = methods[prev_index];
411                struct si_result *prev_this_size =
412                   &results[util_logbase2(size)][placement][prev->index];
413 
414                /* If the best one is also the best for the previous size,
415                 * just bump the size for the previous one.
416                 *
417                 * If there is no best, it means all methods were too slow
418                 * for this size and were not tested. Use the best one for
419                 * the previous size.
420                 */
421                if (!best ||
422                    /* If it's the same method as for the previous size: */
423                    (prev->is_cp == best->is_cp && prev->is_sdma == best->is_sdma &&
424                     prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy &&
425                     prev->dwords_per_thread == best->dwords_per_thread &&
426                     prev->waves_per_sh == best->waves_per_sh) ||
427                    /* If the method for the previous size is also the best
428                     * for this size: */
429                    (prev_this_size->is_valid && prev_this_size->score * 1.03 > best->score)) {
430                   method_max_size[prev_index] = size;
431                   continue;
432                }
433             }
434 
435             /* Add it to the list. */
436             assert(num_methods < ARRAY_SIZE(methods));
437             methods[num_methods] = best;
438             method_max_size[num_methods] = size;
439             num_methods++;
440          }
441 
442          for (unsigned i = 0; i < num_methods; i++) {
443             struct si_result *best = methods[i];
444             unsigned size = method_max_size[i];
445 
446             /* The size threshold is between the current benchmarked
447              * size and the next benchmarked size. */
448             if (i < num_methods - 1)
449                printf("         if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
450             else if (i > 0)
451                printf("         else                   ");
452             else
453                printf("         ");
454             printf("return ");
455 
456             assert(best);
457             const char *cache_policy_str =
458                best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
459                best->cache_policy == L2_LRU ? "L2_LRU   " : "L2_STREAM";
460 
461             if (best->is_cp) {
462                printf("CP_DMA(%s);\n", cache_policy_str);
463             }
464             if (best->is_sdma)
465                printf("SDMA;\n");
466             if (best->is_cs) {
467                printf("COMPUTE(%s, %u, %u);\n", cache_policy_str,
468                       best->dwords_per_thread, best->waves_per_sh);
469             }
470          }
471       }
472       puts("      }");
473    }
474    puts("   }");
475    puts("}");
476 
477    ctx->destroy(ctx);
478    exit(0);
479 }
480