1 /*
2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3  Intel funded Tungsten Graphics to
4  develop this 3D driver.
5 
6  Permission is hereby granted, free of charge, to any person obtaining
7  a copy of this software and associated documentation files (the
8  "Software"), to deal in the Software without restriction, including
9  without limitation the rights to use, copy, modify, merge, publish,
10  distribute, sublicense, and/or sell copies of the Software, and to
11  permit persons to whom the Software is furnished to do so, subject to
12  the following conditions:
13 
14  The above copyright notice and this permission notice (including the
15  next paragraph) shall be included in all copies or substantial
16  portions of the Software.
17 
18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 
26  **********************************************************************/
27  /*
28   * Authors:
29   *   Keith Whitwell <keithw@vmware.com>
30   */
31 
32 /** @file brw_program_cache.c
33  *
34  * This file implements a simple program cache for 965.  The consumers can
35  *  query the hash table of programs using a cache_id and program key, and
36  * receive the corresponding program buffer object (plus associated auxiliary
37  *  data) in return.  Objects in the cache may not have relocations
38  * (pointers to other BOs) in them.
39  *
40  * The inner workings are a simple hash table based on a CRC of the
41  * key data.
42  *
43  * Replacement is not implemented.  Instead, when the cache gets too
44  * big we throw out all of the cache data and let it get regenerated.
45  */
46 
47 #include "main/imports.h"
48 #include "main/streaming-load-memcpy.h"
49 #include "x86/common_x86_asm.h"
50 #include "intel_batchbuffer.h"
51 #include "brw_state.h"
52 #include "brw_wm.h"
53 #include "brw_gs.h"
54 #include "brw_cs.h"
55 #include "brw_program.h"
56 #include "compiler/brw_eu.h"
57 
58 #define FILE_DEBUG_FLAG DEBUG_STATE
59 
60 struct brw_cache_item {
61    /**
62     * Effectively part of the key, cache_id identifies what kind of state
63     * buffer is involved, and also which dirty flag should set.
64     */
65    enum brw_cache_id cache_id;
66 
67    /** 32-bit hash of the key data */
68    GLuint hash;
69 
70    /** for variable-sized keys */
71    GLuint key_size;
72    GLuint prog_data_size;
73    const void *key;
74 
75    uint32_t offset;
76    uint32_t size;
77 
78    struct brw_cache_item *next;
79 };
80 
81 static unsigned
get_program_string_id(enum brw_cache_id cache_id,const void * key)82 get_program_string_id(enum brw_cache_id cache_id, const void *key)
83 {
84    switch (cache_id) {
85    case BRW_CACHE_VS_PROG:
86       return ((struct brw_vs_prog_key *) key)->program_string_id;
87    case BRW_CACHE_TCS_PROG:
88       return ((struct brw_tcs_prog_key *) key)->program_string_id;
89    case BRW_CACHE_TES_PROG:
90       return ((struct brw_tes_prog_key *) key)->program_string_id;
91    case BRW_CACHE_GS_PROG:
92       return ((struct brw_gs_prog_key *) key)->program_string_id;
93    case BRW_CACHE_CS_PROG:
94       return ((struct brw_cs_prog_key *) key)->program_string_id;
95    case BRW_CACHE_FS_PROG:
96       return ((struct brw_wm_prog_key *) key)->program_string_id;
97    default:
98       unreachable("no program string id for this kind of program");
99    }
100 }
101 
102 static GLuint
hash_key(struct brw_cache_item * item)103 hash_key(struct brw_cache_item *item)
104 {
105    GLuint *ikey = (GLuint *)item->key;
106    GLuint hash = item->cache_id, i;
107 
108    assert(item->key_size % 4 == 0);
109 
110    /* I'm sure this can be improved on:
111     */
112    for (i = 0; i < item->key_size/4; i++) {
113       hash ^= ikey[i];
114       hash = (hash << 5) | (hash >> 27);
115    }
116 
117    return hash;
118 }
119 
120 static int
brw_cache_item_equals(const struct brw_cache_item * a,const struct brw_cache_item * b)121 brw_cache_item_equals(const struct brw_cache_item *a,
122                       const struct brw_cache_item *b)
123 {
124    return a->cache_id == b->cache_id &&
125       a->hash == b->hash &&
126       a->key_size == b->key_size &&
127       (memcmp(a->key, b->key, a->key_size) == 0);
128 }
129 
130 static struct brw_cache_item *
search_cache(struct brw_cache * cache,GLuint hash,struct brw_cache_item * lookup)131 search_cache(struct brw_cache *cache, GLuint hash,
132              struct brw_cache_item *lookup)
133 {
134    struct brw_cache_item *c;
135 
136 #if 0
137    int bucketcount = 0;
138 
139    for (c = cache->items[hash % cache->size]; c; c = c->next)
140       bucketcount++;
141 
142    fprintf(stderr, "bucket %d/%d = %d/%d items\n", hash % cache->size,
143            cache->size, bucketcount, cache->n_items);
144 #endif
145 
146    for (c = cache->items[hash % cache->size]; c; c = c->next) {
147       if (brw_cache_item_equals(lookup, c))
148          return c;
149    }
150 
151    return NULL;
152 }
153 
154 
155 static void
rehash(struct brw_cache * cache)156 rehash(struct brw_cache *cache)
157 {
158    struct brw_cache_item **items;
159    struct brw_cache_item *c, *next;
160    GLuint size, i;
161 
162    size = cache->size * 3;
163    items = calloc(size, sizeof(*items));
164 
165    for (i = 0; i < cache->size; i++)
166       for (c = cache->items[i]; c; c = next) {
167          next = c->next;
168          c->next = items[c->hash % size];
169          items[c->hash % size] = c;
170       }
171 
172    free(cache->items);
173    cache->items = items;
174    cache->size = size;
175 }
176 
177 
178 /**
179  * Returns the buffer object matching cache_id and key, or NULL.
180  */
181 bool
brw_search_cache(struct brw_cache * cache,enum brw_cache_id cache_id,const void * key,GLuint key_size,uint32_t * inout_offset,void * inout_prog_data)182 brw_search_cache(struct brw_cache *cache,
183                  enum brw_cache_id cache_id,
184                  const void *key, GLuint key_size,
185                  uint32_t *inout_offset, void *inout_prog_data)
186 {
187    struct brw_context *brw = cache->brw;
188    struct brw_cache_item *item;
189    struct brw_cache_item lookup;
190    GLuint hash;
191 
192    lookup.cache_id = cache_id;
193    lookup.key = key;
194    lookup.key_size = key_size;
195    hash = hash_key(&lookup);
196    lookup.hash = hash;
197 
198    item = search_cache(cache, hash, &lookup);
199 
200    if (item == NULL)
201       return false;
202 
203    void *prog_data = ((char *) item->key) + item->key_size;
204 
205    if (item->offset != *inout_offset ||
206        prog_data != *((void **) inout_prog_data)) {
207       brw->ctx.NewDriverState |= (1 << cache_id);
208       *inout_offset = item->offset;
209       *((void **) inout_prog_data) = prog_data;
210    }
211 
212    return true;
213 }
214 
215 static void
brw_cache_new_bo(struct brw_cache * cache,uint32_t new_size)216 brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
217 {
218    struct brw_context *brw = cache->brw;
219    struct brw_bo *new_bo;
220 
221    perf_debug("Copying to larger program cache: %u kB -> %u kB\n",
222               (unsigned) cache->bo->size / 1024, new_size / 1024);
223 
224    new_bo = brw_bo_alloc(brw->bufmgr, "program cache", new_size, 64);
225    if (can_do_exec_capture(brw->screen))
226       new_bo->kflags = EXEC_OBJECT_CAPTURE;
227 
228    void *map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE |
229                                        MAP_ASYNC | MAP_PERSISTENT);
230 
231    /* Copy any existing data that needs to be saved. */
232    if (cache->next_offset != 0) {
233 #ifdef USE_SSE41
234       if (!cache->bo->cache_coherent && cpu_has_sse4_1)
235          _mesa_streaming_load_memcpy(map, cache->map, cache->next_offset);
236       else
237 #endif
238          memcpy(map, cache->map, cache->next_offset);
239    }
240 
241    brw_bo_unmap(cache->bo);
242    brw_bo_unreference(cache->bo);
243    cache->bo = new_bo;
244    cache->map = map;
245 
246    /* Since we have a new BO in place, we need to signal the units
247     * that depend on it (state base address on gen5+, or unit state before).
248     */
249    brw->ctx.NewDriverState |= BRW_NEW_PROGRAM_CACHE;
250    brw->batch.state_base_address_emitted = false;
251 }
252 
253 /**
254  * Attempts to find an item in the cache with identical data.
255  */
256 static const struct brw_cache_item *
brw_lookup_prog(const struct brw_cache * cache,enum brw_cache_id cache_id,const void * data,unsigned data_size)257 brw_lookup_prog(const struct brw_cache *cache,
258                 enum brw_cache_id cache_id,
259                 const void *data, unsigned data_size)
260 {
261    unsigned i;
262    const struct brw_cache_item *item;
263 
264    for (i = 0; i < cache->size; i++) {
265       for (item = cache->items[i]; item; item = item->next) {
266          if (item->cache_id != cache_id || item->size != data_size ||
267              memcmp(cache->map + item->offset, data, item->size) != 0)
268             continue;
269 
270          return item;
271       }
272    }
273 
274    return NULL;
275 }
276 
277 static uint32_t
brw_alloc_item_data(struct brw_cache * cache,uint32_t size)278 brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
279 {
280    uint32_t offset;
281 
282    /* Allocate space in the cache BO for our new program. */
283    if (cache->next_offset + size > cache->bo->size) {
284       uint32_t new_size = cache->bo->size * 2;
285 
286       while (cache->next_offset + size > new_size)
287          new_size *= 2;
288 
289       brw_cache_new_bo(cache, new_size);
290    }
291 
292    offset = cache->next_offset;
293 
294    /* Programs are always 64-byte aligned, so set up the next one now */
295    cache->next_offset = ALIGN(offset + size, 64);
296 
297    return offset;
298 }
299 
300 const void *
brw_find_previous_compile(struct brw_cache * cache,enum brw_cache_id cache_id,unsigned program_string_id)301 brw_find_previous_compile(struct brw_cache *cache,
302                           enum brw_cache_id cache_id,
303                           unsigned program_string_id)
304 {
305    for (unsigned i = 0; i < cache->size; i++) {
306       for (struct brw_cache_item *c = cache->items[i]; c; c = c->next) {
307          if (c->cache_id == cache_id &&
308              get_program_string_id(cache_id, c->key) == program_string_id) {
309             return c->key;
310          }
311       }
312    }
313 
314    return NULL;
315 }
316 
317 void
brw_upload_cache(struct brw_cache * cache,enum brw_cache_id cache_id,const void * key,GLuint key_size,const void * data,GLuint data_size,const void * prog_data,GLuint prog_data_size,uint32_t * out_offset,void * out_prog_data)318 brw_upload_cache(struct brw_cache *cache,
319                  enum brw_cache_id cache_id,
320                  const void *key,
321                  GLuint key_size,
322                  const void *data,
323                  GLuint data_size,
324                  const void *prog_data,
325                  GLuint prog_data_size,
326                  uint32_t *out_offset,
327                  void *out_prog_data)
328 {
329    struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
330    const struct brw_cache_item *matching_data =
331       brw_lookup_prog(cache, cache_id, data, data_size);
332    GLuint hash;
333    void *tmp;
334 
335    item->cache_id = cache_id;
336    item->size = data_size;
337    item->key = key;
338    item->key_size = key_size;
339    item->prog_data_size = prog_data_size;
340    hash = hash_key(item);
341    item->hash = hash;
342 
343    /* If we can find a matching prog in the cache already, then reuse the
344     * existing stuff without creating new copy into the underlying buffer
345     * object. This is notably useful for programs generating shaders at
346     * runtime, where multiple shaders may compile to the same thing in our
347     * backend.
348     */
349    if (matching_data) {
350       item->offset = matching_data->offset;
351    } else {
352       item->offset = brw_alloc_item_data(cache, data_size);
353 
354       /* Copy data to the buffer */
355       memcpy(cache->map + item->offset, data, data_size);
356    }
357 
358    /* Set up the memory containing the key and prog_data */
359    tmp = malloc(key_size + prog_data_size);
360 
361    memcpy(tmp, key, key_size);
362    memcpy(tmp + key_size, prog_data, prog_data_size);
363 
364    item->key = tmp;
365 
366    if (cache->n_items > cache->size * 1.5f)
367       rehash(cache);
368 
369    hash %= cache->size;
370    item->next = cache->items[hash];
371    cache->items[hash] = item;
372    cache->n_items++;
373 
374    *out_offset = item->offset;
375    *(void **)out_prog_data = (void *)((char *)item->key + item->key_size);
376    cache->brw->ctx.NewDriverState |= 1 << cache_id;
377 }
378 
379 void
brw_init_caches(struct brw_context * brw)380 brw_init_caches(struct brw_context *brw)
381 {
382    struct brw_cache *cache = &brw->cache;
383 
384    cache->brw = brw;
385 
386    cache->size = 7;
387    cache->n_items = 0;
388    cache->items =
389       calloc(cache->size, sizeof(struct brw_cache_item *));
390 
391    cache->bo = brw_bo_alloc(brw->bufmgr, "program cache", 16384, 64);
392    if (can_do_exec_capture(brw->screen))
393       cache->bo->kflags = EXEC_OBJECT_CAPTURE;
394 
395    cache->map = brw_bo_map(brw, cache->bo, MAP_READ | MAP_WRITE |
396                                            MAP_ASYNC | MAP_PERSISTENT);
397 }
398 
399 static void
brw_clear_cache(struct brw_context * brw,struct brw_cache * cache)400 brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
401 {
402    struct brw_cache_item *c, *next;
403    GLuint i;
404 
405    DBG("%s\n", __func__);
406 
407    for (i = 0; i < cache->size; i++) {
408       for (c = cache->items[i]; c; c = next) {
409          next = c->next;
410          if (c->cache_id == BRW_CACHE_VS_PROG ||
411              c->cache_id == BRW_CACHE_TCS_PROG ||
412              c->cache_id == BRW_CACHE_TES_PROG ||
413              c->cache_id == BRW_CACHE_GS_PROG ||
414              c->cache_id == BRW_CACHE_FS_PROG ||
415              c->cache_id == BRW_CACHE_CS_PROG) {
416             const void *item_prog_data = c->key + c->key_size;
417             brw_stage_prog_data_free(item_prog_data);
418          }
419          free((void *)c->key);
420          free(c);
421       }
422       cache->items[i] = NULL;
423    }
424 
425    cache->n_items = 0;
426 
427    /* Start putting programs into the start of the BO again, since
428     * we'll never find the old results.
429     */
430    cache->next_offset = 0;
431 
432    /* We need to make sure that the programs get regenerated, since
433     * any offsets leftover in brw_context will no longer be valid.
434     */
435    brw->NewGLState = ~0;
436    brw->ctx.NewDriverState = ~0ull;
437    brw->state.pipelines[BRW_RENDER_PIPELINE].mesa = ~0;
438    brw->state.pipelines[BRW_RENDER_PIPELINE].brw = ~0ull;
439    brw->state.pipelines[BRW_COMPUTE_PIPELINE].mesa = ~0;
440    brw->state.pipelines[BRW_COMPUTE_PIPELINE].brw = ~0ull;
441 
442    /* Also, NULL out any stale program pointers. */
443    brw->vs.base.prog_data = NULL;
444    brw->tcs.base.prog_data = NULL;
445    brw->tes.base.prog_data = NULL;
446    brw->gs.base.prog_data = NULL;
447    brw->wm.base.prog_data = NULL;
448    brw->cs.base.prog_data = NULL;
449 
450    intel_batchbuffer_flush(brw);
451 }
452 
453 void
brw_program_cache_check_size(struct brw_context * brw)454 brw_program_cache_check_size(struct brw_context *brw)
455 {
456    /* un-tuned guess.  Each object is generally a page, so 2000 of them is 8 MB of
457     * state cache.
458     */
459    if (brw->cache.n_items > 2000) {
460       perf_debug("Exceeded state cache size limit.  Clearing the set "
461                  "of compiled programs, which will trigger recompiles\n");
462       brw_clear_cache(brw, &brw->cache);
463       brw_cache_new_bo(&brw->cache, brw->cache.bo->size);
464    }
465 }
466 
467 
468 static void
brw_destroy_cache(struct brw_context * brw,struct brw_cache * cache)469 brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
470 {
471 
472    DBG("%s\n", __func__);
473 
474    /* This can be NULL if context creation failed early on */
475    if (cache->bo) {
476       brw_bo_unmap(cache->bo);
477       brw_bo_unreference(cache->bo);
478       cache->bo = NULL;
479       cache->map = NULL;
480    }
481    brw_clear_cache(brw, cache);
482    free(cache->items);
483    cache->items = NULL;
484    cache->size = 0;
485 }
486 
487 
488 void
brw_destroy_caches(struct brw_context * brw)489 brw_destroy_caches(struct brw_context *brw)
490 {
491    brw_destroy_cache(brw, &brw->cache);
492 }
493 
494 static const char *
cache_name(enum brw_cache_id cache_id)495 cache_name(enum brw_cache_id cache_id)
496 {
497    switch (cache_id) {
498    case BRW_CACHE_VS_PROG:
499       return "VS kernel";
500    case BRW_CACHE_TCS_PROG:
501       return "TCS kernel";
502    case BRW_CACHE_TES_PROG:
503       return "TES kernel";
504    case BRW_CACHE_FF_GS_PROG:
505       return "Fixed-function GS kernel";
506    case BRW_CACHE_GS_PROG:
507       return "GS kernel";
508    case BRW_CACHE_CLIP_PROG:
509       return "CLIP kernel";
510    case BRW_CACHE_SF_PROG:
511       return "SF kernel";
512    case BRW_CACHE_FS_PROG:
513       return "FS kernel";
514    case BRW_CACHE_CS_PROG:
515       return "CS kernel";
516    default:
517       return "unknown";
518    }
519 }
520 
521 void
brw_print_program_cache(struct brw_context * brw)522 brw_print_program_cache(struct brw_context *brw)
523 {
524    const struct brw_cache *cache = &brw->cache;
525    struct brw_cache_item *item;
526 
527    for (unsigned i = 0; i < cache->size; i++) {
528       for (item = cache->items[i]; item; item = item->next) {
529          fprintf(stderr, "%s:\n", cache_name(i));
530          brw_disassemble(&brw->screen->devinfo, cache->map,
531                          item->offset, item->size, stderr);
532       }
533    }
534 }
535