1 /*
2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vpx_config.h"
12 #include "vpx_dsp/vpx_dsp_common.h"
13 #include "vpx_mem/vpx_mem.h"
14 #include "vp9/common/vp9_entropymode.h"
15 #include "vp9/common/vp9_thread_common.h"
16 #include "vp9/common/vp9_reconinter.h"
17 #include "vp9/common/vp9_loopfilter.h"
18 
19 #if CONFIG_MULTITHREAD
// Acquires |mutex|. Tries pthread_mutex_trylock() up to kMaxTryLocks times and
// returns as soon as one attempt succeeds; if every attempt fails, falls back
// to a blocking pthread_mutex_lock().
static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
  const int kMaxTryLocks = 4000;
  int attempt;

  for (attempt = 0; attempt < kMaxTryLocks; ++attempt) {
    // pthread_mutex_trylock() returns 0 when the lock has been taken.
    if (pthread_mutex_trylock(mutex) == 0) return;
  }

  // None of the non-blocking attempts succeeded; wait for the lock.
  pthread_mutex_lock(mutex);
}
35 #endif  // CONFIG_MULTITHREAD
36 
sync_read(VP9LfSync * const lf_sync,int r,int c)37 static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) {
38 #if CONFIG_MULTITHREAD
39   const int nsync = lf_sync->sync_range;
40 
41   if (r && !(c & (nsync - 1))) {
42     pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1];
43     mutex_lock(mutex);
44 
45     while (c > lf_sync->cur_sb_col[r - 1] - nsync) {
46       pthread_cond_wait(&lf_sync->cond_[r - 1], mutex);
47     }
48     pthread_mutex_unlock(mutex);
49   }
50 #else
51   (void)lf_sync;
52   (void)r;
53   (void)c;
54 #endif  // CONFIG_MULTITHREAD
55 }
56 
sync_write(VP9LfSync * const lf_sync,int r,int c,const int sb_cols)57 static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c,
58                               const int sb_cols) {
59 #if CONFIG_MULTITHREAD
60   const int nsync = lf_sync->sync_range;
61   int cur;
62   // Only signal when there are enough filtered SB for next row to run.
63   int sig = 1;
64 
65   if (c < sb_cols - 1) {
66     cur = c;
67     if (c % nsync)
68       sig = 0;
69   } else {
70     cur = sb_cols + nsync;
71   }
72 
73   if (sig) {
74     mutex_lock(&lf_sync->mutex_[r]);
75 
76     lf_sync->cur_sb_col[r] = cur;
77 
78     pthread_cond_signal(&lf_sync->cond_[r]);
79     pthread_mutex_unlock(&lf_sync->mutex_[r]);
80   }
81 #else
82   (void)lf_sync;
83   (void)r;
84   (void)c;
85   (void)sb_cols;
86 #endif  // CONFIG_MULTITHREAD
87 }
88 
89 // Implement row loopfiltering for each thread.
90 static INLINE
thread_loop_filter_rows(const YV12_BUFFER_CONFIG * const frame_buffer,VP9_COMMON * const cm,struct macroblockd_plane planes[MAX_MB_PLANE],int start,int stop,int y_only,VP9LfSync * const lf_sync)91 void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer,
92                              VP9_COMMON *const cm,
93                              struct macroblockd_plane planes[MAX_MB_PLANE],
94                              int start, int stop, int y_only,
95                              VP9LfSync *const lf_sync) {
96   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
97   const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
98   int mi_row, mi_col;
99   enum lf_path path;
100   if (y_only)
101     path = LF_PATH_444;
102   else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
103     path = LF_PATH_420;
104   else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
105     path = LF_PATH_444;
106   else
107     path = LF_PATH_SLOW;
108 
109   for (mi_row = start; mi_row < stop;
110        mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {
111     MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
112     LOOP_FILTER_MASK *lfm = get_lfm(&cm->lf, mi_row, 0);
113 
114     for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE, ++lfm) {
115       const int r = mi_row >> MI_BLOCK_SIZE_LOG2;
116       const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
117       int plane;
118 
119       sync_read(lf_sync, r, c);
120 
121       vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
122 
123       vp9_adjust_mask(cm, mi_row, mi_col, lfm);
124 
125       vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, lfm);
126       for (plane = 1; plane < num_planes; ++plane) {
127         switch (path) {
128           case LF_PATH_420:
129             vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, lfm);
130             break;
131           case LF_PATH_444:
132             vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, lfm);
133             break;
134           case LF_PATH_SLOW:
135             vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
136                                           mi_row, mi_col);
137             break;
138         }
139       }
140 
141       sync_write(lf_sync, r, c, sb_cols);
142     }
143   }
144 }
145 
146 // Row-based multi-threaded loopfilter hook
loop_filter_row_worker(VP9LfSync * const lf_sync,LFWorkerData * const lf_data)147 static int loop_filter_row_worker(VP9LfSync *const lf_sync,
148                                   LFWorkerData *const lf_data) {
149   thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
150                           lf_data->start, lf_data->stop, lf_data->y_only,
151                           lf_sync);
152   return 1;
153 }
154 
// Runs the loop filter over mi rows [start, stop) of |frame| using up to
// |nworkers| workers from |workers|.
//
// The number of workers actually used is min(nworkers, number of tile
// columns). Worker i begins at superblock row i (relative to |start|), and
// thread_loop_filter_rows() strides by lf_sync->num_workers superblock rows.
// The last worker is executed on the calling thread; the others are launched.
// The function returns once every worker has been synced.
static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame,
                                VP9_COMMON *cm,
                                struct macroblockd_plane planes[MAX_MB_PLANE],
                                int start, int stop, int y_only,
                                VPxWorker *workers, int nworkers,
                                VP9LfSync *lf_sync) {
  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
  // Number of superblock rows.
  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
  // Decoder may allocate more threads than number of tiles based on user's
  // input.
  const int tile_cols = 1 << cm->log2_tile_cols;
  const int num_workers = VPXMIN(nworkers, tile_cols);
  int i;

  // Rebuild the sync state whenever the geometry or the worker count differs
  // from what it was allocated for. The worker count must match exactly:
  // thread_loop_filter_rows() uses lf_sync->num_workers as its row stride, so
  // if fewer workers were launched than lf_sync->num_workers, some superblock
  // rows would never be filtered and sync_read() would wait on them forever.
  if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
      num_workers != lf_sync->num_workers) {
    vp9_loop_filter_dealloc(lf_sync);
    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
  }

  // Initialize cur_sb_col to -1 for all SB rows.
  memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);

  // Set up loopfilter thread data.
  // The decoder is capping num_workers because it has been observed that using
  // more threads on the loopfilter than there are cores will hurt performance
  // on Android. This is because the system will only schedule the tile decode
  // workers on cores equal to the number of tile columns. Then if the decoder
  // tries to use more threads for the loopfilter, it will hurt performance
  // because of contention. If the multithreading code changes in the future
  // then the number of workers used by the loopfilter should be revisited.
  for (i = 0; i < num_workers; ++i) {
    VPxWorker *const worker = &workers[i];
    LFWorkerData *const lf_data = &lf_sync->lfdata[i];

    worker->hook = (VPxWorkerHook)loop_filter_row_worker;
    worker->data1 = lf_sync;
    worker->data2 = lf_data;

    // Loopfilter data
    vp9_loop_filter_data_reset(lf_data, frame, cm, planes);
    lf_data->start = start + i * MI_BLOCK_SIZE;
    lf_data->stop = stop;
    lf_data->y_only = y_only;

    // Start loopfiltering. The last worker runs on the calling thread.
    if (i == num_workers - 1) {
      winterface->execute(worker);
    } else {
      winterface->launch(worker);
    }
  }

  // Wait till all rows are finished
  for (i = 0; i < num_workers; ++i) {
    winterface->sync(&workers[i]);
  }
}
214 
// Multi-threaded loop filter entry point for a frame.
//
// Returns immediately when |frame_filter_level| is 0. Otherwise the whole
// frame (all cm->mi_rows rows) is filtered, unless |partial_frame| is nonzero
// and the frame has more than 8 mi rows: in that case filtering starts at
// half the frame height rounded down to a multiple of 8 and covers
// max(cm->mi_rows / 8, 8) rows. The remaining arguments are forwarded to
// loop_filter_rows_mt().
void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
                              VP9_COMMON *cm,
                              struct macroblockd_plane planes[MAX_MB_PLANE],
                              int frame_filter_level,
                              int y_only, int partial_frame,
                              VPxWorker *workers, int num_workers,
                              VP9LfSync *lf_sync) {
  int first_mi_row = 0;
  int mi_row_count;
  int last_mi_row;

  if (!frame_filter_level) return;

  mi_row_count = cm->mi_rows;
  if (partial_frame && cm->mi_rows > 8) {
    // Begin at the vertical midpoint, aligned down to a multiple of 8.
    first_mi_row = (cm->mi_rows >> 1) & 0xfffffff8;
    mi_row_count = VPXMAX(cm->mi_rows / 8, 8);
  }
  last_mi_row = first_mi_row + mi_row_count;

  vp9_loop_filter_frame_init(cm, frame_filter_level);

  loop_filter_rows_mt(frame, cm, planes, first_mi_row, last_mi_row,
                      y_only, workers, num_workers, lf_sync);
}
239 
// Returns the sync range (nsync) to use for a frame of the given |width|:
// 1 below 640, 2 up to 1280, 4 up to 4096, and 8 above that.
static INLINE int get_sync_range(int width) {
  // nsync numbers are picked by testing. For example, for 4k
  // video, using 4 gives best performance.
  if (width < 640) return 1;
  if (width <= 1280) return 2;
  if (width <= 4096) return 4;
  return 8;
}
253 
254 // Allocate memory for lf row synchronization
vp9_loop_filter_alloc(VP9LfSync * lf_sync,VP9_COMMON * cm,int rows,int width,int num_workers)255 void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
256                            int width, int num_workers) {
257   lf_sync->rows = rows;
258 #if CONFIG_MULTITHREAD
259   {
260     int i;
261 
262     CHECK_MEM_ERROR(cm, lf_sync->mutex_,
263                     vpx_malloc(sizeof(*lf_sync->mutex_) * rows));
264     if (lf_sync->mutex_) {
265       for (i = 0; i < rows; ++i) {
266         pthread_mutex_init(&lf_sync->mutex_[i], NULL);
267       }
268     }
269 
270     CHECK_MEM_ERROR(cm, lf_sync->cond_,
271                     vpx_malloc(sizeof(*lf_sync->cond_) * rows));
272     if (lf_sync->cond_) {
273       for (i = 0; i < rows; ++i) {
274         pthread_cond_init(&lf_sync->cond_[i], NULL);
275       }
276     }
277   }
278 #endif  // CONFIG_MULTITHREAD
279 
280   CHECK_MEM_ERROR(cm, lf_sync->lfdata,
281                   vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));
282   lf_sync->num_workers = num_workers;
283 
284   CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
285                   vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
286 
287   // Set up nsync.
288   lf_sync->sync_range = get_sync_range(width);
289 }
290 
291 // Deallocate lf synchronization related mutex and data
vp9_loop_filter_dealloc(VP9LfSync * lf_sync)292 void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {
293   if (lf_sync != NULL) {
294 #if CONFIG_MULTITHREAD
295     int i;
296 
297     if (lf_sync->mutex_ != NULL) {
298       for (i = 0; i < lf_sync->rows; ++i) {
299         pthread_mutex_destroy(&lf_sync->mutex_[i]);
300       }
301       vpx_free(lf_sync->mutex_);
302     }
303     if (lf_sync->cond_ != NULL) {
304       for (i = 0; i < lf_sync->rows; ++i) {
305         pthread_cond_destroy(&lf_sync->cond_[i]);
306       }
307       vpx_free(lf_sync->cond_);
308     }
309 #endif  // CONFIG_MULTITHREAD
310     vpx_free(lf_sync->lfdata);
311     vpx_free(lf_sync->cur_sb_col);
312     // clear the structure as the source of this call may be a resize in which
313     // case this call will be followed by an _alloc() which may fail.
314     vp9_zero(*lf_sync);
315   }
316 }
317 
318 // Accumulate frame counts.
vp9_accumulate_frame_counts(FRAME_COUNTS * accum,const FRAME_COUNTS * counts,int is_dec)319 void vp9_accumulate_frame_counts(FRAME_COUNTS *accum,
320                                  const FRAME_COUNTS *counts, int is_dec) {
321   int i, j, k, l, m;
322 
323   for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
324     for (j = 0; j < INTRA_MODES; j++)
325       accum->y_mode[i][j] += counts->y_mode[i][j];
326 
327   for (i = 0; i < INTRA_MODES; i++)
328     for (j = 0; j < INTRA_MODES; j++)
329       accum->uv_mode[i][j] += counts->uv_mode[i][j];
330 
331   for (i = 0; i < PARTITION_CONTEXTS; i++)
332     for (j = 0; j < PARTITION_TYPES; j++)
333       accum->partition[i][j] += counts->partition[i][j];
334 
335   if (is_dec) {
336     int n;
337     for (i = 0; i < TX_SIZES; i++)
338       for (j = 0; j < PLANE_TYPES; j++)
339         for (k = 0; k < REF_TYPES; k++)
340           for (l = 0; l < COEF_BANDS; l++)
341             for (m = 0; m < COEFF_CONTEXTS; m++) {
342               accum->eob_branch[i][j][k][l][m] +=
343                   counts->eob_branch[i][j][k][l][m];
344               for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
345                 accum->coef[i][j][k][l][m][n] +=
346                     counts->coef[i][j][k][l][m][n];
347             }
348   } else {
349     for (i = 0; i < TX_SIZES; i++)
350       for (j = 0; j < PLANE_TYPES; j++)
351         for (k = 0; k < REF_TYPES; k++)
352           for (l = 0; l < COEF_BANDS; l++)
353             for (m = 0; m < COEFF_CONTEXTS; m++)
354               accum->eob_branch[i][j][k][l][m] +=
355                   counts->eob_branch[i][j][k][l][m];
356                 // In the encoder, coef is only updated at frame
357                 // level, so not need to accumulate it here.
358                 // for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
359                 //   accum->coef[i][j][k][l][m][n] +=
360                 //       counts->coef[i][j][k][l][m][n];
361   }
362 
363   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
364     for (j = 0; j < SWITCHABLE_FILTERS; j++)
365       accum->switchable_interp[i][j] += counts->switchable_interp[i][j];
366 
367   for (i = 0; i < INTER_MODE_CONTEXTS; i++)
368     for (j = 0; j < INTER_MODES; j++)
369       accum->inter_mode[i][j] += counts->inter_mode[i][j];
370 
371   for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
372     for (j = 0; j < 2; j++)
373       accum->intra_inter[i][j] += counts->intra_inter[i][j];
374 
375   for (i = 0; i < COMP_INTER_CONTEXTS; i++)
376     for (j = 0; j < 2; j++)
377       accum->comp_inter[i][j] += counts->comp_inter[i][j];
378 
379   for (i = 0; i < REF_CONTEXTS; i++)
380     for (j = 0; j < 2; j++)
381       for (k = 0; k < 2; k++)
382       accum->single_ref[i][j][k] += counts->single_ref[i][j][k];
383 
384   for (i = 0; i < REF_CONTEXTS; i++)
385     for (j = 0; j < 2; j++)
386       accum->comp_ref[i][j] += counts->comp_ref[i][j];
387 
388   for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
389     for (j = 0; j < TX_SIZES; j++)
390       accum->tx.p32x32[i][j] += counts->tx.p32x32[i][j];
391 
392     for (j = 0; j < TX_SIZES - 1; j++)
393       accum->tx.p16x16[i][j] += counts->tx.p16x16[i][j];
394 
395     for (j = 0; j < TX_SIZES - 2; j++)
396       accum->tx.p8x8[i][j] += counts->tx.p8x8[i][j];
397   }
398 
399   for (i = 0; i < TX_SIZES; i++)
400     accum->tx.tx_totals[i] += counts->tx.tx_totals[i];
401 
402   for (i = 0; i < SKIP_CONTEXTS; i++)
403     for (j = 0; j < 2; j++)
404       accum->skip[i][j] += counts->skip[i][j];
405 
406   for (i = 0; i < MV_JOINTS; i++)
407     accum->mv.joints[i] += counts->mv.joints[i];
408 
409   for (k = 0; k < 2; k++) {
410     nmv_component_counts *const comps = &accum->mv.comps[k];
411     const nmv_component_counts *const comps_t = &counts->mv.comps[k];
412 
413     for (i = 0; i < 2; i++) {
414       comps->sign[i] += comps_t->sign[i];
415       comps->class0_hp[i] += comps_t->class0_hp[i];
416       comps->hp[i] += comps_t->hp[i];
417     }
418 
419     for (i = 0; i < MV_CLASSES; i++)
420       comps->classes[i] += comps_t->classes[i];
421 
422     for (i = 0; i < CLASS0_SIZE; i++) {
423       comps->class0[i] += comps_t->class0[i];
424       for (j = 0; j < MV_FP_SIZE; j++)
425         comps->class0_fp[i][j] += comps_t->class0_fp[i][j];
426     }
427 
428     for (i = 0; i < MV_OFFSET_BITS; i++)
429       for (j = 0; j < 2; j++)
430         comps->bits[i][j] += comps_t->bits[i][j];
431 
432     for (i = 0; i < MV_FP_SIZE; i++)
433       comps->fp[i] += comps_t->fp[i];
434   }
435 }
436