@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@*  ih264_weighted_pred_a9q.s
@*
@* @brief
@*  Contains function definitions for weighted prediction.
@*
@* @author
@*  Kaushik Senthoor R
@*
@* @par List of Functions:
@*
@*  - ih264_weighted_pred_luma_a9q()
@*  - ih264_weighted_pred_chroma_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@*******************************************************************************
@* @function
@*  ih264_weighted_pred_luma_a9q()
@*
@* @brief
@*  This routine performs the weighted prediction as described in sec
@* 8.4.2.3.2 titled "Weighted sample prediction process" for luma.
@*
@* @par Description:
@*  This function gets a ht x wd block, multiplies every sample by the weight,
@* rounds off by log_wd bits, adds the offset and stores the result, saturated
@* to unsigned 8-bit, in the destination block:
@*
@*      dst = sat_u8(round_shift(src * wt, log_wd) + ofst)
@*
@* @param[in] pu1_src:
@*  UWORD8 Pointer to the buffer containing the input block.
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination where the output block is stored.
@*
@* @param[in] src_strd
@*  Stride of the input buffer
@*
@* @param[in] dst_strd
@*  Stride of the destination buffer
@*
@* @param[in] log_wd
@*  number of bits to be rounded off
@*
@* @param[in] wt
@*  weight for the weighted prediction (only the low 16 bits are used)
@*
@* @param[in] ofst
@*  offset used after rounding off (only the low 8 bits are used, as a
@*  signed value)
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  None
@*
@* @remarks
@*  (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
@*
@*  Every loop below handles four rows per iteration and tests the remaining
@*  height only after those rows have been written, so ht must be a positive
@*  multiple of 4; otherwise rows beyond ht are read and written.
@*  wd selects the loop: 16 -> loop_16, 8 -> loop_8, any other value is
@*  processed as width 4 (loop_4).
@*  The product src * wt is held in 16 bits (non-saturating multiply).
@*
@*******************************************************************************
@*
@void ih264_weighted_pred_luma_a9q(UWORD8 *pu1_src,
@                                  UWORD8 *pu1_dst,
@                                  WORD32 src_strd,
@                                  WORD32 dst_strd,
@                                  WORD32 log_wd,
@                                  WORD32 wt,
@                                  WORD32 ofst,
@                                  WORD32 ht,
@                                  WORD32 wd)
@
@**************Variables Vs Registers*****************************************
@   r0      => pu1_src
@   r1      => pu1_dst
@   r2      => src_strd
@   r3      => dst_strd
@   [sp]    => log_wd (r4)
@   [sp+4]  => wt     (r5)
@   [sp+8]  => ofst   (r6)
@   [sp+12] => ht     (r7)
@   [sp+16] => wd     (r8)
@
@   The stack offsets above are relative to sp at function entry. The code
@   reads them after pushing seven core registers (28 bytes), i.e. from
@   [sp+28] .. [sp+44].
@
@   r9      => -log_wd
@   d2      => wt      replicated in every 16-bit lane
@   d3      => ofst    replicated in every 8-bit lane
@   q0      => -log_wd replicated in every 16-bit lane (right-shift count)
@   q2-q13  => working registers of the row loops
@
.text
.p2align 2

    .global ih264_weighted_pred_luma_a9q

ih264_weighted_pred_luma_a9q:

    stmfd         sp!, {r4-r9, r14}     @save r4-r9 and lr (7 words = 28 bytes)
    ldr           r5, [sp, #32]         @Load wt
    ldr           r4, [sp, #28]         @Load log_wd in r4
    ldr           r6, [sp, #36]         @Load ofst
    ldr           r7, [sp, #40]         @Load ht
    ldr           r8, [sp, #44]         @Load wd
    vpush         {d8-d15}              @save d8-d15 (q4-q7 are used by loop_8 and loop_16)

    vdup.16       d2, r5                @D2 = wt (16-bit)
    neg           r9, r4                @r9 = -log_wd
    vdup.8        d3, r6                @D3 = ofst (8-bit)
    cmp           r8, #16               @check if wd is 16
    vdup.16       q0, r9                @Q0 = -log_wd (16-bit), negative count => right shift in vrshl
    beq           loop_16               @branch if wd is 16

    cmp           r8, #8                @check if wd is 8
    beq           loop_8                @branch if wd is 8
                                        @any other wd falls through to the width-4 loop

loop_4:                                 @each iteration processes four rows

    vld1.32       d4[0], [r0], r2       @load row 1 in source
    vld1.32       d4[1], [r0], r2       @load row 2 in source
    vld1.32       d6[0], [r0], r2       @load row 3 in source
    vld1.32       d6[1], [r0], r2       @load row 4 in source

    vmovl.u8      q2, d4                @converting rows 1,2 to 16-bit
    vmovl.u8      q3, d6                @converting rows 3,4 to 16-bit

    vmul.s16      q2, q2, d2[0]         @weight mult. for rows 1,2
    vmul.s16      q3, q3, d2[0]         @weight mult. for rows 3,4

    subs          r7, r7, #4            @decrement ht by 4 (flags are consumed by the bgt below)
    vrshl.s16     q2, q2, q0            @rounds off the weighted samples from rows 1,2
    vrshl.s16     q3, q3, q0            @rounds off the weighted samples from rows 3,4

    vaddw.s8      q2, q2, d3            @adding offset for rows 1,2
    vaddw.s8      q3, q3, d3            @adding offset for rows 3,4

    vqmovun.s16   d4, q2                @saturating rows 1,2 to unsigned 8-bit
    vqmovun.s16   d6, q3                @saturating rows 3,4 to unsigned 8-bit

    vst1.32       d4[0], [r1], r3       @store row 1 in destination
    vst1.32       d4[1], [r1], r3       @store row 2 in destination
    vst1.32       d6[0], [r1], r3       @store row 3 in destination
    vst1.32       d6[1], [r1], r3       @store row 4 in destination

    bgt           loop_4                @if greater than 0 repeat the loop again

    b             end_loops

loop_8:                                 @each iteration processes four rows

    vld1.8        d4, [r0], r2          @load row 1 in source
    vld1.8        d6, [r0], r2          @load row 2 in source
    vld1.8        d8, [r0], r2          @load row 3 in source
    vmovl.u8      q2, d4                @converting row 1 to 16-bit
    vld1.8        d10, [r0], r2         @load row 4 in source
    vmovl.u8      q3, d6                @converting row 2 to 16-bit

    vmovl.u8      q4, d8                @converting row 3 to 16-bit
    vmul.s16      q2, q2, d2[0]         @weight mult. for row 1
    vmovl.u8      q5, d10               @converting row 4 to 16-bit
    vmul.s16      q3, q3, d2[0]         @weight mult. for row 2
    vmul.s16      q4, q4, d2[0]         @weight mult. for row 3
    vmul.s16      q5, q5, d2[0]         @weight mult. for row 4

    vrshl.s16     q2, q2, q0            @rounds off the weighted samples from row 1
    vrshl.s16     q3, q3, q0            @rounds off the weighted samples from row 2
    vrshl.s16     q4, q4, q0            @rounds off the weighted samples from row 3
    vaddw.s8      q2, q2, d3            @adding offset for row 1
    vrshl.s16     q5, q5, q0            @rounds off the weighted samples from row 4
    vaddw.s8      q3, q3, d3            @adding offset for row 2

    vaddw.s8      q4, q4, d3            @adding offset for row 3
    vqmovun.s16   d4, q2                @saturating row 1 to unsigned 8-bit
    vaddw.s8      q5, q5, d3            @adding offset for row 4
    vqmovun.s16   d6, q3                @saturating row 2 to unsigned 8-bit
    vqmovun.s16   d8, q4                @saturating row 3 to unsigned 8-bit
    vqmovun.s16   d10, q5               @saturating row 4 to unsigned 8-bit

    vst1.8        d4, [r1], r3          @store row 1 in destination
    vst1.8        d6, [r1], r3          @store row 2 in destination
    subs          r7, r7, #4            @decrement ht by 4 (flags are consumed by the bgt below)
    vst1.8        d8, [r1], r3          @store row 3 in destination
    vst1.8        d10, [r1], r3         @store row 4 in destination

    bgt           loop_8                @if greater than 0 repeat the loop again

    b             end_loops

loop_16:                                @each iteration processes four rows

    vld1.8        {q2}, [r0], r2        @load row 1 in source
    vld1.8        {q3}, [r0], r2        @load row 2 in source
    vmovl.u8      q6, d4                @converting row 1L to 16-bit
    vld1.8        {q4}, [r0], r2        @load row 3 in source
    vmovl.u8      q7, d5                @converting row 1H to 16-bit
    vld1.8        {q5}, [r0], r2        @load row 4 in source

    vmovl.u8      q8, d6                @converting row 2L to 16-bit
    vmul.s16      q6, q6, d2[0]         @weight mult. for row 1L
    vmovl.u8      q9, d7                @converting row 2H to 16-bit
    vmul.s16      q7, q7, d2[0]         @weight mult. for row 1H
    vmovl.u8      q10, d8               @converting row 3L to 16-bit
    vmul.s16      q8, q8, d2[0]         @weight mult. for row 2L
    vmovl.u8      q11, d9               @converting row 3H to 16-bit
    vmul.s16      q9, q9, d2[0]         @weight mult. for row 2H
    vmovl.u8      q12, d10              @converting row 4L to 16-bit
    vmul.s16      q10, q10, d2[0]       @weight mult. for row 3L
    vmovl.u8      q13, d11              @converting row 4H to 16-bit
    vmul.s16      q11, q11, d2[0]       @weight mult. for row 3H

    vmul.s16      q12, q12, d2[0]       @weight mult. for row 4L
    vrshl.s16     q6, q6, q0            @rounds off the weighted samples from row 1L
    vmul.s16      q13, q13, d2[0]       @weight mult. for row 4H

    vrshl.s16     q7, q7, q0            @rounds off the weighted samples from row 1H
    vrshl.s16     q8, q8, q0            @rounds off the weighted samples from row 2L
    vaddw.s8      q6, q6, d3            @adding offset for row 1L
    vrshl.s16     q9, q9, q0            @rounds off the weighted samples from row 2H
    vaddw.s8      q7, q7, d3            @adding offset for row 1H
    vqmovun.s16   d4, q6                @saturating row 1L to unsigned 8-bit
    vrshl.s16     q10, q10, q0          @rounds off the weighted samples from row 3L
    vaddw.s8      q8, q8, d3            @adding offset for row 2L
    vqmovun.s16   d5, q7                @saturating row 1H to unsigned 8-bit
    vrshl.s16     q11, q11, q0          @rounds off the weighted samples from row 3H
    vaddw.s8      q9, q9, d3            @adding offset for row 2H
    vqmovun.s16   d6, q8                @saturating row 2L to unsigned 8-bit
    vrshl.s16     q12, q12, q0          @rounds off the weighted samples from row 4L
    vaddw.s8      q10, q10, d3          @adding offset for row 3L
    vqmovun.s16   d7, q9                @saturating row 2H to unsigned 8-bit
    vrshl.s16     q13, q13, q0          @rounds off the weighted samples from row 4H
    vaddw.s8      q11, q11, d3          @adding offset for row 3H

    vqmovun.s16   d8, q10               @saturating row 3L to unsigned 8-bit
    vaddw.s8      q12, q12, d3          @adding offset for row 4L
    vqmovun.s16   d9, q11               @saturating row 3H to unsigned 8-bit
    vaddw.s8      q13, q13, d3          @adding offset for row 4H

    vqmovun.s16   d10, q12              @saturating row 4L to unsigned 8-bit
    vst1.8        {q2}, [r1], r3        @store row 1 in destination
    vqmovun.s16   d11, q13              @saturating row 4H to unsigned 8-bit
    vst1.8        {q3}, [r1], r3        @store row 2 in destination
    subs          r7, r7, #4            @decrement ht by 4 (flags are consumed by the bgt below)
    vst1.8        {q4}, [r1], r3        @store row 3 in destination
    vst1.8        {q5}, [r1], r3        @store row 4 in destination

    bgt           loop_16               @if greater than 0 repeat the loop again

end_loops:

    vpop          {d8-d15}              @restore d8-d15
    ldmfd         sp!, {r4-r9, r15}     @restore r4-r9 and return (saved lr is popped into pc)
269
270
@*******************************************************************************
@* @function
@*  ih264_weighted_pred_chroma_a9q()
@*
@* @brief
@*  This routine performs the weighted prediction as described in sec
@* 8.4.2.3.2 titled "Weighted sample prediction process" for chroma.
@*
@* @par Description:
@*  This function gets a ht x wd block, calculates the weighted sample, rounds
@* off, adds offset and stores it in the destination block for U and V.
@*  Each row holds wd pairs of bytes (2 * wd bytes are loaded and stored per
@* row). Even-indexed bytes of a row use the first weight/offset and
@* odd-indexed bytes use the second one:
@*
@*      dst = sat_u8(round_shift(src * wt_x, log_wd) + ofst_x)
@*
@* @param[in] pu1_src:
@*  UWORD8 Pointer to the buffer containing the input block.
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination where the output block is stored.
@*
@* @param[in] src_strd
@*  Stride of the input buffer
@*
@* @param[in] dst_strd
@*  Stride of the destination buffer
@*
@* @param[in] log_wd
@*  number of bits to be rounded off
@*
@* @param[in] wt
@*  weights for the weighted prediction for U and V, packed as two 16-bit
@*  values: bits 15:0 weight the even-indexed bytes of a row, bits 31:16 the
@*  odd-indexed bytes
@*
@* @param[in] ofst
@*  offsets used after rounding off for U and V, packed as two signed 8-bit
@*  values: bits 7:0 are added to the even-indexed bytes of a row, bits 15:8
@*  to the odd-indexed bytes
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  None
@*
@* @remarks
@*  (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
@*
@*  wd selects the loop: 8 -> loop_8_uv, 4 -> loop_4_uv, any other value is
@*  processed as width 2 (loop_2_uv).
@*  loop_2_uv and loop_4_uv handle two rows per iteration, loop_8_uv handles
@*  four rows per iteration. The remaining height is tested only after the
@*  rows have been written, so ht must be a positive multiple of that row
@*  count; otherwise rows beyond ht are read and written.
@*  The product src * wt is held in 16 bits (non-saturating multiply).
@*
@*******************************************************************************
@*
@void ih264_weighted_pred_chroma_a9q(UWORD8 *pu1_src,
@                                    UWORD8 *pu1_dst,
@                                    WORD32 src_strd,
@                                    WORD32 dst_strd,
@                                    WORD32 log_wd,
@                                    WORD32 wt,
@                                    WORD32 ofst,
@                                    WORD32 ht,
@                                    WORD32 wd)
@
@**************Variables Vs Registers*****************************************
@   r0      => pu1_src
@   r1      => pu1_dst
@   r2      => src_strd
@   r3      => dst_strd
@   [sp]    => log_wd (r4)
@   [sp+4]  => wt     (r5)
@   [sp+8]  => ofst   (r6)
@   [sp+12] => ht     (r7)
@   [sp+16] => wd     (r8)
@
@   The stack offsets above are relative to sp at function entry. The code
@   reads them after pushing seven core registers (28 bytes), i.e. from
@   [sp+28] .. [sp+44].
@
@   r9      => -log_wd
@   q1      => {wt (bits 15:0), wt (bits 31:16)} repeated in 16-bit lanes
@   d4      => {ofst (bits 7:0), ofst (bits 15:8)} repeated in 8-bit lanes
@   q0      => -log_wd replicated in every 16-bit lane (right-shift count)
@   q3-q14  => working registers of the row loops
@


    .global ih264_weighted_pred_chroma_a9q

ih264_weighted_pred_chroma_a9q:

    stmfd         sp!, {r4-r9, r14}     @save r4-r9 and lr (7 words = 28 bytes)

    ldr           r4, [sp, #28]         @Load log_wd in r4
    ldr           r5, [sp, #32]         @Load wt = {wt_u (16-bit), wt_v (16-bit)}
    ldr           r6, [sp, #36]         @Load ofst = {ofst_u (8-bit), ofst_v (8-bit)}
    ldr           r8, [sp, #44]         @Load wd

    neg           r9, r4                @r9 = -log_wd
    vdup.32       q1, r5                @Q1 = {wt_u (16-bit), wt_v (16-bit)}
    ldr           r7, [sp, #40]         @Load ht (must stay ahead of vpush, which moves sp)
    vpush         {d8-d15}              @save d8-d15 (q4-q6 are used by loop_4_uv and loop_8_uv)
    vdup.16       d4, r6                @D4 = {ofst_u (8-bit), ofst_v (8-bit)}
    cmp           r8, #8                @check if wd is 8
    vdup.16       q0, r9                @Q0 = -log_wd (16-bit), negative count => right shift in vrshl
    beq           loop_8_uv             @branch if wd is 8

    cmp           r8, #4                @check if wd is 4
    beq           loop_4_uv             @branch if wd is 4
                                        @any other wd falls through to the width-2 loop

loop_2_uv:                              @each iteration processes two rows

    vld1.32       d6[0], [r0], r2       @load row 1 in source
    vld1.32       d6[1], [r0], r2       @load row 2 in source

    vmovl.u8      q3, d6                @converting rows 1,2 to 16-bit

    vmul.s16      q3, q3, q1            @weight mult. for rows 1,2

    vrshl.s16     q3, q3, q0            @rounds off the weighted samples from rows 1,2

    vaddw.s8      q3, q3, d4            @adding offset for rows 1,2

    vqmovun.s16   d6, q3                @saturating rows 1,2 to unsigned 8-bit

    subs          r7, r7, #2            @decrement ht by 2 (flags are consumed by the bgt below)
    vst1.32       d6[0], [r1], r3       @store row 1 in destination
    vst1.32       d6[1], [r1], r3       @store row 2 in destination

    bgt           loop_2_uv             @if greater than 0 repeat the loop again

    b             end_loops_uv

loop_4_uv:                              @each iteration processes two rows

    vld1.8        d6, [r0], r2          @load row 1 in source
    vld1.8        d8, [r0], r2          @load row 2 in source

    vmovl.u8      q3, d6                @converting row 1 to 16-bit
    vmovl.u8      q4, d8                @converting row 2 to 16-bit

    vmul.s16      q3, q3, q1            @weight mult. for row 1
    vmul.s16      q4, q4, q1            @weight mult. for row 2

    subs          r7, r7, #2            @decrement ht by 2 (flags are consumed by the bgt below)
    vrshl.s16     q3, q3, q0            @rounds off the weighted samples from row 1
    vrshl.s16     q4, q4, q0            @rounds off the weighted samples from row 2

    vaddw.s8      q3, q3, d4            @adding offset for row 1
    vaddw.s8      q4, q4, d4            @adding offset for row 2

    vqmovun.s16   d6, q3                @saturating row 1 to unsigned 8-bit
    vqmovun.s16   d8, q4                @saturating row 2 to unsigned 8-bit

    vst1.8        d6, [r1], r3          @store row 1 in destination
    vst1.8        d8, [r1], r3          @store row 2 in destination

    bgt           loop_4_uv             @if greater than 0 repeat the loop again

    b             end_loops_uv

loop_8_uv:                              @each iteration processes four rows

    vld1.8        {q3}, [r0], r2        @load row 1 in source
    vld1.8        {q4}, [r0], r2        @load row 2 in source
    vmovl.u8      q7, d6                @converting row 1L to 16-bit
    vld1.8        {q5}, [r0], r2        @load row 3 in source
    vmovl.u8      q8, d7                @converting row 1H to 16-bit
    vld1.8        {q6}, [r0], r2        @load row 4 in source

    vmul.s16      q7, q7, q1            @weight mult. for row 1L
    vmovl.u8      q9, d8                @converting row 2L to 16-bit
    vmul.s16      q8, q8, q1            @weight mult. for row 1H
    vmovl.u8      q10, d9               @converting row 2H to 16-bit
    vmul.s16      q9, q9, q1            @weight mult. for row 2L
    vmovl.u8      q11, d10              @converting row 3L to 16-bit
    vmul.s16      q10, q10, q1          @weight mult. for row 2H
    vmovl.u8      q12, d11              @converting row 3H to 16-bit
    vmul.s16      q11, q11, q1          @weight mult. for row 3L
    vmovl.u8      q13, d12              @converting row 4L to 16-bit
    vmul.s16      q12, q12, q1          @weight mult. for row 3H
    vmovl.u8      q14, d13              @converting row 4H to 16-bit

    vmul.s16      q13, q13, q1          @weight mult. for row 4L
    vrshl.s16     q7, q7, q0            @rounds off the weighted samples from row 1L
    vmul.s16      q14, q14, q1          @weight mult. for row 4H

    vrshl.s16     q8, q8, q0            @rounds off the weighted samples from row 1H
    vrshl.s16     q9, q9, q0            @rounds off the weighted samples from row 2L
    vaddw.s8      q7, q7, d4            @adding offset for row 1L
    vrshl.s16     q10, q10, q0          @rounds off the weighted samples from row 2H
    vaddw.s8      q8, q8, d4            @adding offset for row 1H
    vqmovun.s16   d6, q7                @saturating row 1L to unsigned 8-bit
    vrshl.s16     q11, q11, q0          @rounds off the weighted samples from row 3L
    vaddw.s8      q9, q9, d4            @adding offset for row 2L
    vqmovun.s16   d7, q8                @saturating row 1H to unsigned 8-bit
    vrshl.s16     q12, q12, q0          @rounds off the weighted samples from row 3H
    vaddw.s8      q10, q10, d4          @adding offset for row 2H
    vqmovun.s16   d8, q9                @saturating row 2L to unsigned 8-bit
    vrshl.s16     q13, q13, q0          @rounds off the weighted samples from row 4L
    vaddw.s8      q11, q11, d4          @adding offset for row 3L
    vqmovun.s16   d9, q10               @saturating row 2H to unsigned 8-bit
    vrshl.s16     q14, q14, q0          @rounds off the weighted samples from row 4H
    vaddw.s8      q12, q12, d4          @adding offset for row 3H

    vqmovun.s16   d10, q11              @saturating row 3L to unsigned 8-bit
    vaddw.s8      q13, q13, d4          @adding offset for row 4L
    vqmovun.s16   d11, q12              @saturating row 3H to unsigned 8-bit
    vaddw.s8      q14, q14, d4          @adding offset for row 4H

    vqmovun.s16   d12, q13              @saturating row 4L to unsigned 8-bit
    vst1.8        {q3}, [r1], r3        @store row 1 in destination
    vqmovun.s16   d13, q14              @saturating row 4H to unsigned 8-bit
    vst1.8        {q4}, [r1], r3        @store row 2 in destination
    subs          r7, r7, #4            @decrement ht by 4 (flags are consumed by the bgt below)
    vst1.8        {q5}, [r1], r3        @store row 3 in destination
    vst1.8        {q6}, [r1], r3        @store row 4 in destination

    bgt           loop_8_uv             @if greater than 0 repeat the loop again

end_loops_uv:

    vpop          {d8-d15}              @restore d8-d15
    ldmfd         sp!, {r4-r9, r15}     @restore r4-r9 and return (saved lr is popped into pc)
478
479
480