1@/******************************************************************************
2@ *
3@ * Copyright (C) 2015 The Android Open Source Project
4@ *
5@ * Licensed under the Apache License, Version 2.0 (the "License");
6@ * you may not use this file except in compliance with the License.
7@ * You may obtain a copy of the License at:
8@ *
9@ * http://www.apache.org/licenses/LICENSE-2.0
10@ *
11@ * Unless required by applicable law or agreed to in writing, software
12@ * distributed under the License is distributed on an "AS IS" BASIS,
13@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@ * See the License for the specific language governing permissions and
15@ * limitations under the License.
16@ *
17@ *****************************************************************************
18@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19@*/
20@**
21@******************************************************************************
22@* @file
23@*  ih264_weighted_bi_pred_a9q.s
24@*
25@* @brief
26@*  Contains function definitions for weighted biprediction.
27@*
28@* @author
29@*  Kaushik Senthoor R
30@*
31@* @par List of Functions:
32@*
33@*  - ih264_weighted_bi_pred_luma_a9q()
34@*  - ih264_weighted_bi_pred_chroma_a9q()
35@*
36@* @remarks
37@*  None
38@*
39@*******************************************************************************
40@*
41@*******************************************************************************
42@* @function
43@*  ih264_weighted_bi_pred_luma_a9q()
44@*
45@* @brief
46@*  This routine performs the weighted biprediction as described in sec
47@* 8.4.2.3.2 titled "Weighted sample prediction process" for luma.
48@*
49@* @par Description:
50@*  This function gets two ht x wd blocks, calculates the weighted samples,
51@* rounds off, adds offset and stores it in the destination block.
52@*
53@* @param[in] pu1_src1
54@*  UWORD8 Pointer to the buffer containing the input block 1.
55@*
56@* @param[in] pu1_src2
57@*  UWORD8 Pointer to the buffer containing the input block 2.
58@*
59@* @param[out] pu1_dst
60@*  UWORD8 pointer to the destination where the output block is stored.
61@*
62@* @param[in] src_strd1
63@*  Stride of the input buffer 1
64@*
65@* @param[in] src_strd2
66@*  Stride of the input buffer 2
67@*
68@* @param[in] dst_strd
69@*  Stride of the destination buffer
70@*
71@* @param[in] log_wd
72@*  number of bits to be rounded off
73@*
74@* @param[in] wt1
75@*  weight for the weighted prediction
76@*
77@* @param[in] wt2
78@*  weight for the weighted prediction
79@*
80@* @param[in] ofst1
81@*  offset 1 used after rounding off
82@*
83@* @param[in] ofst2
84@*  offset 2 used after rounding off
85@*
86@* @param[in] ht
87@*  integer height of the array
88@*
89@* @param[in] wd
90@*  integer width of the array
91@*
92@* @returns
93@*  None
94@*
95@* @remarks
96@*  (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
97@*
98@*******************************************************************************
99@*
100@void ih264_weighted_bi_pred_luma_a9q(UWORD8 *pu1_src1,
101@                                     UWORD8 *pu1_src2,
102@                                     UWORD8 *pu1_dst,
103@                                     WORD32 src_strd1,
104@                                     WORD32 src_strd2,
105@                                     WORD32 dst_strd,
106@                                     WORD32 log_wd,
107@                                     WORD32 wt1,
108@                                     WORD32 wt2,
109@                                     WORD32 ofst1,
110@                                     WORD32 ofst2,
111@                                     WORD32 ht,
112@                                     WORD32 wd)
113@
114@**************Variables Vs Registers*****************************************
115@   r0      => pu1_src1
116@   r1      => pu1_src2
117@   r2      => pu1_dst
118@   r3      => src_strd1
119@   [sp]    => src_strd2 (r4)
120@   [sp+4]  => dst_strd  (r5)
121@   [sp+8]  => log_wd    (r6)
122@   [sp+12] => wt1       (r7)
123@   [sp+16] => wt2       (r8)
124@   [sp+20] => ofst1     (r9)
125@   [sp+24] => ofst2     (r10)
126@   [sp+28] => ht        (r11)
127@   [sp+32] => wd        (r12)
128@
.text
.p2align 2

    .global ih264_weighted_bi_pred_luma_a9q

@ Weighted biprediction for a luma block.
@
@ Per sample this routine computes
@   dst = sat_u8( round_shift(src1 * wt1 + src2 * wt2, log_wd + 1) + ofst )
@ with ofst = (ofst1 + ofst2 + 1) >> 1.
@
@ Register arguments: r0 = pu1_src1, r1 = pu1_src2, r2 = pu1_dst, r3 = src_strd1.
@ The remaining arguments are read from the stack; after the 40-byte register
@ save below they start at [sp, #40].
@
@ The weights are taken as signed 16-bit values (sxth), matching the 16-bit
@ NEON lanes they are multiplied in. Truncating them to 8 bits would turn a
@ weight of +128 into -128. The offsets are taken as signed 8-bit values.
@ Products are accumulated in 16-bit lanes (vmul.s16 / vmla.s16), so the
@ weighted sum has to fit in a signed 16-bit value.
@
@ wd selects the loop: 16 -> loop_16, 8 -> loop_8, anything else -> loop_4.
@ Every loop consumes four rows per iteration.

ih264_weighted_bi_pred_luma_a9q:

    stmfd         sp!, {r4-r12, r14}    @save r4-r12 and lr (10 words = 40 bytes)
    ldr           r6, [sp, #48]         @Load log_wd in r6
    ldr           r7, [sp, #52]         @Load wt1 in r7
    ldr           r8, [sp, #56]         @Load wt2 in r8
    ldr           r9, [sp, #60]         @Load ofst1 in r9

    add           r6, r6, #1            @r6  = log_wd + 1
    sxth          r7, r7                @sign-extend 16-bit wt1 to 32-bit
    ldr           r4, [sp, #40]         @Load src_strd2 in r4
    ldr           r5, [sp, #44]         @Load dst_strd in r5
    sxtb          r9, r9                @sign-extend 8-bit ofst1 to 32-bit
    rsb           r10, r6, #0           @r10 = -(log_wd + 1)
    ldr           r11, [sp, #68]        @Load ht in r11
    ldr           r12, [sp, #72]        @Load wd in r12
    vdup.16       q0, r10               @Q0  = -(log_wd + 1) in every 16-bit lane (negative => right shift)
    add           r9, r9, #1            @r9 = ofst1 + 1

    ldr           r10, [sp, #64]        @Load ofst2 in r10 (last stack read; must precede vpush)
    sxth          r8, r8                @sign-extend 16-bit wt2 to 32-bit
    cmp           r12, #16              @check if wd is 16 (flags consumed by beq below)
    vpush         {d8-d15}              @save the NEON registers d8-d15 used by the loops
    sxtb          r10, r10              @sign-extend 8-bit ofst2 to 32-bit
    add           r9, r9, r10           @r9 = ofst1 + ofst2 + 1
    vmov          d2, r7, r8            @D2 = {wt1(32-bit), wt2(32-bit)}; 16-bit lane 0 = wt1, lane 2 = wt2
    asr           r9, r9, #1            @r9 = ofst = (ofst1 + ofst2 + 1) >> 1
    vdup.8        d3, r9                @D3 = ofst (8-bit) in every lane
    beq           loop_16               @branch if wd is 16

    cmp           r12, #8               @check if wd is 8
    beq           loop_8                @branch if wd is 8

loop_4:                                 @each iteration processes four rows

    vld1.32       d4[0], [r0], r3       @load row 1 in source 1
    vld1.32       d4[1], [r0], r3       @load row 2 in source 1
    vld1.32       d6[0], [r1], r4       @load row 1 in source 2
    vld1.32       d6[1], [r1], r4       @load row 2 in source 2

    vmovl.u8      q2, d4                @converting rows 1,2 in source 1 to 16-bit
    vld1.32       d8[0], [r0], r3       @load row 3 in source 1
    vld1.32       d8[1], [r0], r3       @load row 4 in source 1
    vmovl.u8      q3, d6                @converting rows 1,2 in source 2 to 16-bit
    vld1.32       d10[0], [r1], r4      @load row 3 in source 2
    vld1.32       d10[1], [r1], r4      @load row 4 in source 2

    vmovl.u8      q4, d8                @converting rows 3,4 in source 1 to 16-bit
    vmovl.u8      q5, d10               @converting rows 3,4 in source 2 to 16-bit

    vmul.s16      q2, q2, d2[0]         @weight 1 mult. for rows 1,2
    vmla.s16      q2, q3, d2[2]         @weight 2 mult. for rows 1,2
    vmul.s16      q4, q4, d2[0]         @weight 1 mult. for rows 3,4
    vmla.s16      q4, q5, d2[2]         @weight 2 mult. for rows 3,4

    subs          r11, r11, #4          @decrement ht by 4 (NEON ops below leave the flags intact)
    vrshl.s16     q2, q2, q0            @rounds off the weighted samples from rows 1,2
    vrshl.s16     q4, q4, q0            @rounds off the weighted samples from rows 3,4

    vaddw.s8      q2, q2, d3            @adding offset for rows 1,2
    vaddw.s8      q4, q4, d3            @adding offset for rows 3,4

    vqmovun.s16   d4, q2                @saturating rows 1,2 to unsigned 8-bit
    vqmovun.s16   d8, q4                @saturating rows 3,4 to unsigned 8-bit

    vst1.32       d4[0], [r2], r5       @store row 1 in destination
    vst1.32       d4[1], [r2], r5       @store row 2 in destination
    vst1.32       d8[0], [r2], r5       @store row 3 in destination
    vst1.32       d8[1], [r2], r5       @store row 4 in destination

    bgt           loop_4                @if greater than 0 repeat the loop again

    b             end_loops

loop_8:                                 @each iteration processes four rows

    vld1.8        d4, [r0], r3          @load row 1 in source 1
    vld1.8        d6, [r1], r4          @load row 1 in source 2
    vld1.8        d8, [r0], r3          @load row 2 in source 1
    vld1.8        d10, [r1], r4         @load row 2 in source 2
    vmovl.u8      q2, d4                @converting row 1 in source 1 to 16-bit
    vld1.8        d12, [r0], r3         @load row 3 in source 1
    vld1.8        d14, [r1], r4         @load row 3 in source 2
    vmovl.u8      q3, d6                @converting row 1 in source 2 to 16-bit
    vld1.8        d16, [r0], r3         @load row 4 in source 1
    vld1.8        d18, [r1], r4         @load row 4 in source 2

    vmovl.u8      q4, d8                @converting row 2 in source 1 to 16-bit
    vmovl.u8      q5, d10               @converting row 2 in source 2 to 16-bit

    vmul.s16      q2, q2, d2[0]         @weight 1 mult. for row 1
    vmla.s16      q2, q3, d2[2]         @weight 2 mult. for row 1
    vmovl.u8      q6, d12               @converting row 3 in source 1 to 16-bit
    vmovl.u8      q7, d14               @converting row 3 in source 2 to 16-bit
    vmul.s16      q4, q4, d2[0]         @weight 1 mult. for row 2
    vmla.s16      q4, q5, d2[2]         @weight 2 mult. for row 2
    vmovl.u8      q8, d16               @converting row 4 in source 1 to 16-bit
    vmovl.u8      q9, d18               @converting row 4 in source 2 to 16-bit

    vmul.s16      q6, q6, d2[0]         @weight 1 mult. for row 3
    vmla.s16      q6, q7, d2[2]         @weight 2 mult. for row 3
    vmul.s16      q8, q8, d2[0]         @weight 1 mult. for row 4
    vmla.s16      q8, q9, d2[2]         @weight 2 mult. for row 4

    vrshl.s16     q2, q2, q0            @rounds off the weighted samples from row 1
    vrshl.s16     q4, q4, q0            @rounds off the weighted samples from row 2
    vrshl.s16     q6, q6, q0            @rounds off the weighted samples from row 3
    vaddw.s8      q2, q2, d3            @adding offset for row 1
    vrshl.s16     q8, q8, q0            @rounds off the weighted samples from row 4
    vaddw.s8      q4, q4, d3            @adding offset for row 2

    vaddw.s8      q6, q6, d3            @adding offset for row 3
    vqmovun.s16   d4, q2                @saturating row 1 to unsigned 8-bit
    vaddw.s8      q8, q8, d3            @adding offset for row 4
    vqmovun.s16   d8, q4                @saturating row 2 to unsigned 8-bit

    vqmovun.s16   d12, q6               @saturating row 3 to unsigned 8-bit
    vqmovun.s16   d16, q8               @saturating row 4 to unsigned 8-bit

    vst1.8        d4, [r2], r5          @store row 1 in destination
    vst1.8        d8, [r2], r5          @store row 2 in destination
    subs          r11, r11, #4          @decrement ht by 4
    vst1.8        d12, [r2], r5         @store row 3 in destination
    vst1.8        d16, [r2], r5         @store row 4 in destination

    bgt           loop_8                @if greater than 0 repeat the loop again

    b             end_loops

loop_16:                                @each iteration processes four rows (L/H = low/high 8 pixels)

    vld1.8        {q2}, [r0], r3        @load row 1 in source 1
    vld1.8        {q3}, [r1], r4        @load row 1 in source 2
    vld1.8        {q4}, [r0], r3        @load row 2 in source 1
    vld1.8        {q5}, [r1], r4        @load row 2 in source 2
    vmovl.u8      q10, d4               @converting row 1L in source 1 to 16-bit
    vld1.8        {q6}, [r0], r3        @load row 3 in source 1
    vld1.8        {q7}, [r1], r4        @load row 3 in source 2
    vmovl.u8      q11, d6               @converting row 1L in source 2 to 16-bit
    vld1.8        {q8}, [r0], r3        @load row 4 in source 1
    vld1.8        {q9}, [r1], r4        @load row 4 in source 2

    vmovl.u8      q2, d5                @converting row 1H in source 1 to 16-bit
    vmovl.u8      q3, d7                @converting row 1H in source 2 to 16-bit

    vmul.s16      q10, q10, d2[0]       @weight 1 mult. for row 1L
    vmla.s16      q10, q11, d2[2]       @weight 2 mult. for row 1L
    vmovl.u8      q12, d8               @converting row 2L in source 1 to 16-bit
    vmovl.u8      q13, d10              @converting row 2L in source 2 to 16-bit

    vmul.s16      q2, q2, d2[0]         @weight 1 mult. for row 1H
    vmla.s16      q2, q3, d2[2]         @weight 2 mult. for row 1H
    vmovl.u8      q4, d9                @converting row 2H in source 1 to 16-bit
    vmovl.u8      q5, d11               @converting row 2H in source 2 to 16-bit

    vmul.s16      q12, q12, d2[0]       @weight 1 mult. for row 2L
    vmla.s16      q12, q13, d2[2]       @weight 2 mult. for row 2L
    vmovl.u8      q14, d12              @converting row 3L in source 1 to 16-bit
    vmovl.u8      q15, d14              @converting row 3L in source 2 to 16-bit

    vmul.s16      q4, q4, d2[0]         @weight 1 mult. for row 2H
    vmla.s16      q4, q5, d2[2]         @weight 2 mult. for row 2H
    vmovl.u8      q6, d13               @converting row 3H in source 1 to 16-bit
    vmovl.u8      q7, d15               @converting row 3H in source 2 to 16-bit

    vmul.s16      q14, q14, d2[0]       @weight 1 mult. for row 3L
    vmla.s16      q14, q15, d2[2]       @weight 2 mult. for row 3L
    vmovl.u8      q11, d16              @converting row 4L in source 1 to 16-bit
    vmovl.u8      q3, d18               @converting row 4L in source 2 to 16-bit

    vmul.s16      q6, q6, d2[0]         @weight 1 mult. for row 3H
    vmla.s16      q6, q7, d2[2]         @weight 2 mult. for row 3H
    vmovl.u8      q8, d17               @converting row 4H in source 1 to 16-bit
    vmovl.u8      q9, d19               @converting row 4H in source 2 to 16-bit

    vmul.s16      q11, q11, d2[0]       @weight 1 mult. for row 4L
    vmla.s16      q11, q3, d2[2]        @weight 2 mult. for row 4L
    vrshl.s16     q10, q10, q0          @rounds off the weighted samples from row 1L

    vmul.s16      q8, q8, d2[0]         @weight 1 mult. for row 4H
    vmla.s16      q8, q9, d2[2]         @weight 2 mult. for row 4H
    vrshl.s16     q2, q2, q0            @rounds off the weighted samples from row 1H

    vrshl.s16     q12, q12, q0          @rounds off the weighted samples from row 2L
    vaddw.s8      q10, q10, d3          @adding offset for row 1L
    vrshl.s16     q4, q4, q0            @rounds off the weighted samples from row 2H
    vaddw.s8      q2, q2, d3            @adding offset for row 1H
    vrshl.s16     q14, q14, q0          @rounds off the weighted samples from row 3L
    vaddw.s8      q12, q12, d3          @adding offset for row 2L
    vrshl.s16     q6, q6, q0            @rounds off the weighted samples from row 3H
    vaddw.s8      q4, q4, d3            @adding offset for row 2H
    vrshl.s16     q11, q11, q0          @rounds off the weighted samples from row 4L
    vaddw.s8      q14, q14, d3          @adding offset for row 3L
    vrshl.s16     q8, q8, q0            @rounds off the weighted samples from row 4H
    vaddw.s8      q6, q6, d3            @adding offset for row 3H

    vqmovun.s16   d26, q10              @saturating row 1L to unsigned 8-bit (q13 = row 1)
    vaddw.s8      q11, q11, d3          @adding offset for row 4L
    vqmovun.s16   d27, q2               @saturating row 1H to unsigned 8-bit
    vaddw.s8      q8, q8, d3            @adding offset for row 4H

    vqmovun.s16   d10, q12              @saturating row 2L to unsigned 8-bit (q5 = row 2)
    vqmovun.s16   d11, q4               @saturating row 2H to unsigned 8-bit
    vqmovun.s16   d30, q14              @saturating row 3L to unsigned 8-bit (q15 = row 3)
    vqmovun.s16   d31, q6               @saturating row 3H to unsigned 8-bit
    vst1.8        {q13}, [r2], r5       @store row 1 in destination
    vqmovun.s16   d14, q11              @saturating row 4L to unsigned 8-bit (q7 = row 4)
    vqmovun.s16   d15, q8               @saturating row 4H to unsigned 8-bit

    vst1.8        {q5}, [r2], r5        @store row 2 in destination
    subs          r11, r11, #4          @decrement ht by 4
    vst1.8        {q15}, [r2], r5       @store row 3 in destination
    vst1.8        {q7}, [r2], r5        @store row 4 in destination

    bgt           loop_16               @if greater than 0 repeat the loop again

end_loops:

    vpop          {d8-d15}              @restore the saved NEON registers
    ldmfd         sp!, {r4-r12, r15}    @restore r4-r12 and return (saved lr is loaded into pc)
354
355
356@*******************************************************************************
357@* @function
358@*  ih264_weighted_bi_pred_chroma_a9q()
359@*
360@* @brief
361@*  This routine performs the weighted biprediction as described in sec
362@* 8.4.2.3.2 titled "Weighted sample prediction process" for chroma.
363@*
364@* @par Description:
365@*  This function gets two ht x wd blocks, calculates the weighted samples,
366@* rounds off, adds offset and stores it in the destination block for U and V.
367@*
368@* @param[in] pu1_src1
369@*  UWORD8 Pointer to the buffer containing the input block 1.
370@*
371@* @param[in] pu1_src2
372@*  UWORD8 Pointer to the buffer containing the input block 2.
373@*
374@* @param[out] pu1_dst
375@*  UWORD8 pointer to the destination where the output block is stored.
376@*
377@* @param[in] src_strd1
378@*  Stride of the input buffer 1
379@*
380@* @param[in] src_strd2
381@*  Stride of the input buffer 2
382@*
383@* @param[in] dst_strd
384@*  Stride of the destination buffer
385@*
386@* @param[in] log_wd
387@*  number of bits to be rounded off
388@*
389@* @param[in] wt1
390@*  weights for the weighted prediction in U and V
391@*
392@* @param[in] wt2
393@*  weights for the weighted prediction in U and V
394@*
395@* @param[in] ofst1
396@*  offset 1 used after rounding off for U and V
397@*
398@* @param[in] ofst2
399@*  offset 2 used after rounding off for U and V
400@*
401@* @param[in] ht
402@*  integer height of the array
403@*
404@* @param[in] wd
405@*  integer width of the array
406@*
407@* @returns
408@*  None
409@*
410@* @remarks
411@*  (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
412@*
413@*******************************************************************************
414@*
415@void ih264_weighted_bi_pred_chroma_a9q(UWORD8 *pu1_src1,
416@                                       UWORD8 *pu1_src2,
417@                                       UWORD8 *pu1_dst,
418@                                       WORD32 src_strd1,
419@                                       WORD32 src_strd2,
420@                                       WORD32 dst_strd,
421@                                       WORD32 log_wd,
422@                                       WORD32 wt1,
423@                                       WORD32 wt2,
424@                                       WORD32 ofst1,
425@                                       WORD32 ofst2,
426@                                       WORD32 ht,
427@                                       WORD32 wd)
428@
429@**************Variables Vs Registers*****************************************
430@   r0      => pu1_src1
431@   r1      => pu1_src2
432@   r2      => pu1_dst
433@   r3      => src_strd1
434@   [sp]    => src_strd2 (r4)
435@   [sp+4]  => dst_strd  (r5)
436@   [sp+8]  => log_wd    (r6)
437@   [sp+12] => wt1       (r7)
438@   [sp+16] => wt2       (r8)
439@   [sp+20] => ofst1     (r9)
440@   [sp+24] => ofst2     (r10)
441@   [sp+28] => ht        (r11)
442@   [sp+32] => wd        (r12)
443@
444
445
446    .global ih264_weighted_bi_pred_chroma_a9q
447
448ih264_weighted_bi_pred_chroma_a9q:
449
450    stmfd         sp!, {r4-r12, r14}    @stack stores the values of the arguments
451
452    ldr           r6, [sp, #48]         @Load log_wd in r6
453    ldr           r7, [sp, #52]         @Load wt1 in r7
454    ldr           r8, [sp, #56]         @Load wt2 in r8
455    add           r6, r6, #1            @r6  = log_wd + 1
456    ldr           r9, [sp, #60]         @Load ofst1 in r9
457    ldr           r10, [sp, #64]        @Load ofst2 in r10
458
459    rsb           r12, r6, #0           @r12 = -(log_wd + 1)
460    ldr           r4, [sp, #40]         @Load src_strd2 in r4
461    ldr           r5, [sp, #44]         @Load dst_strd in r5
462    vdup.16       q0, r12               @Q0  = -(log_wd + 1) (16-bit)
463
464    ldr           r11, [sp, #68]        @Load ht in r11
465    vdup.32       q1, r7                @Q1 = (wt1_u, wt1_v) (32-bit)
466    ldr           r12, [sp, #72]        @Load wd in r12
467    vdup.32       q2, r8                @Q2 = (wt2_u, wt2_v) (32-bit)
468    asr           r7, r9, #8            @r7 = ofst1_v
469    asr           r8, r10, #8           @r8 = ofst2_v
470    vpush         {d8-d15}
471    sxtb          r9, r9                @sign-extend 8-bit ofst1_u to 32-bit
472    sxtb          r10, r10              @sign-extend 8-bit ofst2_u to 32-bit
473    sxtb          r7, r7                @sign-extend 8-bit ofst1_v to 32-bit
474    sxtb          r8, r8                @sign-extend 8-bit ofst2_v to 32-bit
475
476    add           r9, r9, #1            @r9 = ofst1_u + 1
477    add           r7, r7, #1            @r7 = ofst1_v + 1
478    add           r9, r9, r10           @r9 = ofst1_u + ofst2_u + 1
479    add           r7, r7, r8            @r7 = ofst1_v + ofst2_v + 1
480    asr           r9, r9, #1            @r9 = ofst_u = (ofst1_u + ofst2_u + 1) >> 1
481    asr           r7, r7, #1            @r7 = ofst_v = (ofst1_v + ofst2_v + 1) >> 1
482    cmp           r12, #8               @check if wd is 8
483    pkhbt         r9, r9, r7, lsl #16   @r9 = {ofst_u(16-bit), ofst_v(16-bit)}
484    vdup.32       q3, r9                @Q3 = {ofst_u(16-bit), ofst_v(16-bit)}
485    beq           loop_8_uv             @branch if wd is 8
486
487    cmp           r12, #4               @check if wd is 4
488    beq           loop_4_uv             @branch if wd is 4
489
490loop_2_uv:                              @each iteration processes two rows
491
492    vld1.32       d8[0], [r0], r3       @load row 1 in source 1
493    vld1.32       d8[1], [r0], r3       @load row 2 in source 1
494    vld1.32       d10[0], [r1], r4      @load row 1 in source 2
495    vld1.32       d10[1], [r1], r4      @load row 2 in source 2
496
497    vmovl.u8      q4, d8                @converting rows 1,2 in source 1 to 16-bit
498    vmovl.u8      q5, d10               @converting rows 1,2 in source 2 to 16-bit
499
500    vmul.s16      q4, q4, q1            @weight 1 mult. for rows 1,2
501    vmla.s16      q4, q5, q2            @weight 2 mult. for rows 1,2
502
503    vrshl.s16     q4, q4, q0            @rounds off the weighted samples from rows 1,2
504
505    vadd.s16      q4, q4, q3            @adding offset for rows 1,2
506
507    vqmovun.s16   d8, q4                @saturating rows 1,2 to unsigned 8-bit
508
509    vst1.32       d8[0], [r2], r5       @store row 1 in destination
510    vst1.32       d8[1], [r2], r5       @store row 2 in destination
511
512    subs          r11, r11, #2          @decrement ht by 2
513    bgt           loop_2_uv             @if greater than 0 repeat the loop again
514
515    b             end_loops_uv
516
517loop_4_uv:                              @each iteration processes two rows
518
519    vld1.8        d8, [r0], r3          @load row 1 in source 1
520    vld1.8        d10, [r1], r4         @load row 1 in source 2
521    vmovl.u8      q4, d8                @converting row 1 in source 1 to 16-bit
522    vld1.8        d12, [r0], r3         @load row 2 in source 1
523    vmovl.u8      q5, d10               @converting row 1 in source 2 to 16-bit
524    vld1.8        d14, [r1], r4         @load row 2 in source 2
525
526    vmovl.u8      q6, d12               @converting row 2 in source 1 to 16-bit
527    vmul.s16      q4, q4, q1            @weight 1 mult. for row 1
528    vmla.s16      q4, q5, q2            @weight 2 mult. for row 1
529    vmovl.u8      q7, d14               @converting row 2 in source 2 to 16-bit
530
531    vmul.s16      q6, q6, q1            @weight 1 mult. for row 2
532    vmla.s16      q6, q7, q2            @weight 2 mult. for row 2
533
534    subs          r11, r11, #2          @decrement ht by 2
535    vrshl.s16     q4, q4, q0            @rounds off the weighted samples from row 1
536    vrshl.s16     q6, q6, q0            @rounds off the weighted samples from row 2
537    vadd.s16      q4, q4, q3            @adding offset for row 1
538    vadd.s16      q6, q6, q3            @adding offset for row 2
539
540    vqmovun.s16   d8, q4                @saturating row 1 to unsigned 8-bit
541    vqmovun.s16   d12, q6               @saturating row 2 to unsigned 8-bit
542
543    vst1.8        d8, [r2], r5          @store row 1 in destination
544    vst1.8        d12, [r2], r5         @store row 2 in destination
545
546    bgt           loop_4_uv             @if greater than 0 repeat the loop again
547
548    b             end_loops_uv
549
550loop_8_uv:                              @each iteration processes two rows
551
552    vld1.8        {q4}, [r0], r3        @load row 1 in source 1
553    vld1.8        {q5}, [r1], r4        @load row 1 in source 2
554    vld1.8        {q6}, [r0], r3        @load row 2 in source 1
555    vld1.8        {q7}, [r1], r4        @load row 2 in source 2
556    vmovl.u8      q12, d8               @converting row 1L in source 1 to 16-bit
557    vld1.8        {q8}, [r0], r3        @load row 3 in source 1
558    vld1.8        {q9}, [r1], r4        @load row 3 in source 2
559    vmovl.u8      q13, d10              @converting row 1L in source 2 to 16-bit
560    vld1.8        {q10}, [r0], r3       @load row 4 in source 1
561    vld1.8        {q11}, [r1], r4       @load row 4 in source 2
562
563    vmovl.u8      q4, d9                @converting row 1H in source 1 to 16-bit
564    vmovl.u8      q5, d11               @converting row 1H in source 2 to 16-bit
565
566    vmul.s16      q12, q12, q1          @weight 1 mult. for row 1L
567    vmla.s16      q12, q13, q2          @weight 2 mult. for row 1L
568    vmovl.u8      q14, d12              @converting row 2L in source 1 to 16-bit
569    vmovl.u8      q15, d14              @converting row 2L in source 2 to 16-bit
570
571    vmul.s16      q4, q4, q1            @weight 1 mult. for row 1H
572    vmla.s16      q4, q5, q2            @weight 2 mult. for row 1H
573    vmovl.u8      q6, d13               @converting row 2H in source 1 to 16-bit
574    vmovl.u8      q7, d15               @converting row 2H in source 2 to 16-bit
575
576    vmul.s16      q14, q14, q1          @weight 1 mult. for row 2L
577    vmla.s16      q14, q15, q2          @weight 2 mult. for row 2L
578    vmovl.u8      q13, d16              @converting row 3L in source 1 to 16-bit
579    vmovl.u8      q5, d18               @converting row 3L in source 2 to 16-bit
580
581    vmul.s16      q6, q6, q1            @weight 1 mult. for row 2H
582    vmla.s16      q6, q7, q2            @weight 2 mult. for row 2H
583    vmovl.u8      q8, d17               @converting row 3H in source 1 to 16-bit
584    vmovl.u8      q9, d19               @converting row 3H in source 2 to 16-bit
585
586    vmul.s16      q13, q13, q1          @weight 1 mult. for row 3L
587    vmla.s16      q13, q5, q2           @weight 2 mult. for row 3L
588    vmovl.u8      q15, d20              @converting row 4L in source 1 to 16-bit
589    vmovl.u8      q7, d22               @converting row 4L in source 2 to 16-bit
590
591    vmul.s16      q8, q8, q1            @weight 1 mult. for row 3H
592    vmla.s16      q8, q9, q2            @weight 2 mult. for row 3H
593    vmovl.u8      q10, d21              @converting row 4H in source 1 to 16-bit
594    vmovl.u8      q11, d23              @converting row 4H in source 2 to 16-bit
595
596    vmul.s16      q15, q15, q1          @weight 1 mult. for row 4L
597    vmla.s16      q15, q7, q2           @weight 2 mult. for row 4L
598    vrshl.s16     q12, q12, q0          @rounds off the weighted samples from row 1L
599
600    vmul.s16      q10, q10, q1          @weight 1 mult. for row 4H
601    vmla.s16      q10, q11, q2          @weight 2 mult. for row 4H
602    vrshl.s16     q4, q4, q0            @rounds off the weighted samples from row 1H
603
604    vrshl.s16     q14, q14, q0          @rounds off the weighted samples from row 2L
605    vadd.s16      q12, q12, q3          @adding offset for row 1L
606    vrshl.s16     q6, q6, q0            @rounds off the weighted samples from row 2H
607    vadd.s16      q4, q4, q3            @adding offset for row 1H
608    vrshl.s16     q13, q13, q0          @rounds off the weighted samples from row 3L
609    vadd.s16      q14, q14, q3          @adding offset for row 2L
610    vrshl.s16     q8, q8, q0            @rounds off the weighted samples from row 3H
611    vadd.s16      q6, q6, q3            @adding offset for row 2H
612    vrshl.s16     q15, q15, q0          @rounds off the weighted samples from row 4L
613    vadd.s16      q13, q13, q3          @adding offset for row 3L
614    vrshl.s16     q10, q10, q0          @rounds off the weighted samples from row 4H
615    vadd.s16      q8, q8, q3            @adding offset for row 3H
616
617    vqmovun.s16   d10, q12              @saturating row 1L to unsigned 8-bit
618    vadd.s16      q15, q15, q3          @adding offset for row 4L
619    vqmovun.s16   d11, q4               @saturating row 1H to unsigned 8-bit
620    vadd.s16      q10, q10, q3          @adding offset for row 4H
621
622    vqmovun.s16   d18, q14              @saturating row 2L to unsigned 8-bit
623    vqmovun.s16   d19, q6               @saturating row 2H to unsigned 8-bit
624    vqmovun.s16   d14, q13              @saturating row 3L to unsigned 8-bit
625    vqmovun.s16   d15, q8               @saturating row 3H to unsigned 8-bit
626    vst1.8        {q5}, [r2], r5        @store row 1 in destination
627    vqmovun.s16   d22, q15              @saturating row 4L to unsigned 8-bit
628    vqmovun.s16   d23, q10              @saturating row 4H to unsigned 8-bit
629
630    vst1.8        {q9}, [r2], r5        @store row 2 in destination
631    subs          r11, r11, #4          @decrement ht by 4
632    vst1.8        {q7}, [r2], r5        @store row 3 in destination
633    vst1.8        {q11}, [r2], r5       @store row 4 in destination
634
635    bgt           loop_8_uv             @if greater than 0 repeat the loop again
636
637end_loops_uv:
638
639    vpop          {d8-d15}
640    ldmfd         sp!, {r4-r12, r15}    @Reload the registers from sp
641
642
643