1 // Copyright 2020 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 #include "src/post_filter.h"
15 #include "src/utils/blocking_counter.h"
16 
17 namespace libgav1 {
18 
ApplySuperRes(const std::array<uint8_t *,kMaxPlanes> & src,const std::array<int,kMaxPlanes> & rows,const int line_buffer_row,const std::array<uint8_t *,kMaxPlanes> & dst,bool dst_is_loop_restoration_border)19 void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& src,
20                                const std::array<int, kMaxPlanes>& rows,
21                                const int line_buffer_row,
22                                const std::array<uint8_t*, kMaxPlanes>& dst,
23                                bool dst_is_loop_restoration_border /*=false*/) {
24   int plane = kPlaneY;
25   do {
26     const int plane_width =
27         MultiplyBy4(frame_header_.columns4x4) >> subsampling_x_[plane];
28 #if LIBGAV1_MAX_BITDEPTH >= 10
29     if (bitdepth_ >= 10) {
30       auto* input = reinterpret_cast<uint16_t*>(src[plane]);
31       auto* output = reinterpret_cast<uint16_t*>(dst[plane]);
32       const ptrdiff_t input_stride =
33           frame_buffer_.stride(plane) / sizeof(uint16_t);
34       const ptrdiff_t output_stride =
35           (dst_is_loop_restoration_border
36                ? loop_restoration_border_.stride(plane)
37                : frame_buffer_.stride(plane)) /
38           sizeof(uint16_t);
39       if (rows[plane] > 0) {
40         dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
41                        input, input_stride, rows[plane], plane_width,
42                        super_res_info_[plane].upscaled_width,
43                        super_res_info_[plane].initial_subpixel_x,
44                        super_res_info_[plane].step, output, output_stride);
45       }
46       // In the multi-threaded case, the |superres_line_buffer_| holds the last
47       // input row. Apply SuperRes for that row.
48       if (line_buffer_row >= 0) {
49         auto* const line_buffer_start =
50             reinterpret_cast<uint16_t*>(superres_line_buffer_.data(plane)) +
51             line_buffer_row * superres_line_buffer_.stride(plane) /
52                 sizeof(uint16_t) +
53             kSuperResHorizontalBorder;
54         dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
55                        line_buffer_start, /*source_stride=*/0,
56                        /*height=*/1, plane_width,
57                        super_res_info_[plane].upscaled_width,
58                        super_res_info_[plane].initial_subpixel_x,
59                        super_res_info_[plane].step,
60                        output + rows[plane] * output_stride, /*dest_stride=*/0);
61       }
62       continue;
63     }
64 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
65     uint8_t* input = src[plane];
66     uint8_t* output = dst[plane];
67     const ptrdiff_t input_stride = frame_buffer_.stride(plane);
68     const ptrdiff_t output_stride = dst_is_loop_restoration_border
69                                         ? loop_restoration_border_.stride(plane)
70                                         : frame_buffer_.stride(plane);
71     if (rows[plane] > 0) {
72       dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
73                      input, input_stride, rows[plane], plane_width,
74                      super_res_info_[plane].upscaled_width,
75                      super_res_info_[plane].initial_subpixel_x,
76                      super_res_info_[plane].step, output, output_stride);
77     }
78     // In the multi-threaded case, the |superres_line_buffer_| holds the last
79     // input row. Apply SuperRes for that row.
80     if (line_buffer_row >= 0) {
81       uint8_t* const line_buffer_start =
82           superres_line_buffer_.data(plane) +
83           line_buffer_row * superres_line_buffer_.stride(plane) +
84           kSuperResHorizontalBorder;
85       dsp_.super_res(
86           superres_coefficients_[static_cast<int>(plane != 0)],
87           line_buffer_start, /*source_stride=*/0,
88           /*height=*/1, plane_width, super_res_info_[plane].upscaled_width,
89           super_res_info_[plane].initial_subpixel_x,
90           super_res_info_[plane].step, output + rows[plane] * output_stride,
91           /*dest_stride=*/0);
92     }
93   } while (++plane < planes_);
94 }
95 
ApplySuperResForOneSuperBlockRow(int row4x4_start,int sb4x4,bool is_last_row)96 void PostFilter::ApplySuperResForOneSuperBlockRow(int row4x4_start, int sb4x4,
97                                                   bool is_last_row) {
98   assert(row4x4_start >= 0);
99   assert(DoSuperRes());
100   // If not doing cdef, then LR needs two rows of border with superres applied.
101   const int num_rows_extra = (DoCdef() || !DoRestoration()) ? 0 : 2;
102   std::array<uint8_t*, kMaxPlanes> src;
103   std::array<uint8_t*, kMaxPlanes> dst;
104   std::array<int, kMaxPlanes> rows;
105   const int num_rows4x4 =
106       std::min(sb4x4, frame_header_.rows4x4 - row4x4_start) -
107       (is_last_row ? 0 : 2);
108   if (row4x4_start > 0) {
109     const int row4x4 = row4x4_start - 2;
110     int plane = kPlaneY;
111     do {
112       const int row =
113           (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + num_rows_extra;
114       const ptrdiff_t row_offset = row * frame_buffer_.stride(plane);
115       src[plane] = cdef_buffer_[plane] + row_offset;
116       dst[plane] = superres_buffer_[plane] + row_offset;
117       // Note that the |num_rows_extra| subtraction is done after the value is
118       // subsampled since we always need to work on |num_rows_extra| extra rows
119       // irrespective of the plane subsampling.
120       // Apply superres for the last 8-|num_rows_extra| rows of the previous
121       // superblock.
122       rows[plane] = (8 >> subsampling_y_[plane]) - num_rows_extra;
123       // Apply superres for the current superblock row (except for the last
124       // 8-|num_rows_extra| rows).
125       rows[plane] += (MultiplyBy4(num_rows4x4) >> subsampling_y_[plane]) +
126                      (is_last_row ? 0 : num_rows_extra);
127     } while (++plane < planes_);
128   } else {
129     // Apply superres for the current superblock row (except for the last
130     // 8-|num_rows_extra| rows).
131     int plane = kPlaneY;
132     do {
133       const ptrdiff_t row_offset =
134           (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) *
135           frame_buffer_.stride(plane);
136       src[plane] = cdef_buffer_[plane] + row_offset;
137       dst[plane] = superres_buffer_[plane] + row_offset;
138       // Note that the |num_rows_extra| addition is done after the value is
139       // subsampled since we always need to work on |num_rows_extra| extra rows
140       // irrespective of the plane subsampling.
141       rows[plane] = (MultiplyBy4(num_rows4x4) >> subsampling_y_[plane]) +
142                     (is_last_row ? 0 : num_rows_extra);
143     } while (++plane < planes_);
144   }
145   ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst);
146 }
147 
ApplySuperResThreaded()148 void PostFilter::ApplySuperResThreaded() {
149   int num_threads = thread_pool_->num_threads() + 1;
150   // The number of rows that will be processed by each thread in the thread pool
151   // (other than the current thread).
152   int thread_pool_rows = height_ / num_threads;
153   thread_pool_rows = std::max(thread_pool_rows, 1);
154   // Make rows of Y plane even when there is subsampling for the other planes.
155   if ((thread_pool_rows & 1) != 0 && subsampling_y_[kPlaneU] != 0) {
156     ++thread_pool_rows;
157   }
158   // Adjust the number of threads to what we really need.
159   num_threads = Clip3(height_ / thread_pool_rows, 1, num_threads);
160   // For the current thread, we round up to process all the remaining rows.
161   int current_thread_rows = height_ - thread_pool_rows * (num_threads - 1);
162   // Make rows of Y plane even when there is subsampling for the other planes.
163   if ((current_thread_rows & 1) != 0 && subsampling_y_[kPlaneU] != 0) {
164     ++current_thread_rows;
165   }
166   assert(current_thread_rows > 0);
167   BlockingCounter pending_workers(num_threads - 1);
168   for (int line_buffer_row = 0, row_start = 0; line_buffer_row < num_threads;
169        ++line_buffer_row, row_start += thread_pool_rows) {
170     std::array<uint8_t*, kMaxPlanes> src;
171     std::array<uint8_t*, kMaxPlanes> dst;
172     std::array<int, kMaxPlanes> rows;
173     int plane = kPlaneY;
174     const int pixel_size_log2 = pixel_size_log2_;
175     do {
176       src[plane] =
177           GetBufferOffset(cdef_buffer_[plane], frame_buffer_.stride(plane),
178                           static_cast<Plane>(plane), row_start, 0);
179       dst[plane] =
180           GetBufferOffset(superres_buffer_[plane], frame_buffer_.stride(plane),
181                           static_cast<Plane>(plane), row_start, 0);
182       rows[plane] =
183           (((line_buffer_row < num_threads - 1) ? thread_pool_rows
184                                                 : current_thread_rows) >>
185            subsampling_y_[plane]) -
186           1;
187       const int plane_width =
188           MultiplyBy4(frame_header_.columns4x4) >> subsampling_x_[plane];
189       uint8_t* const input =
190           src[plane] + rows[plane] * frame_buffer_.stride(plane);
191       uint8_t* const line_buffer_start =
192           superres_line_buffer_.data(plane) +
193           line_buffer_row * superres_line_buffer_.stride(plane) +
194           (kSuperResHorizontalBorder << pixel_size_log2);
195       memcpy(line_buffer_start, input, plane_width << pixel_size_log2);
196     } while (++plane < planes_);
197     if (line_buffer_row < num_threads - 1) {
198       thread_pool_->Schedule(
199           [this, src, rows, line_buffer_row, dst, &pending_workers]() {
200             ApplySuperRes(src, rows, line_buffer_row, dst);
201             pending_workers.Decrement();
202           });
203     } else {
204       ApplySuperRes(src, rows, line_buffer_row, dst);
205     }
206   }
207   // Wait for the threadpool jobs to finish.
208   pending_workers.Wait();
209 }
210 
211 }  // namespace libgav1
212