1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@*******************************************************************************
20@* @file
21@*  ihevc_weighted_pred_bi_default.s
22@*
23@* @brief
24@*  contains function definitions for weighted prediction used in inter
25@* prediction
26@*
27@* @author
28@*  parthiban v
29@*
30@* @par list of functions:
31@*  - ihevc_weighted_pred_bi_default()
32@*
33@* @remarks
34@*  none
35@*
36@*******************************************************************************
37@*/
38@/**
39@*******************************************************************************
40@*
41@* @brief
42@*  does default bi-weighted prediction on the arrays pointed to by pi2_src1
43@* and pi2_src2 and stores the result at the location pointed to by pu1_dst.
44@* assumption: the function is optimized considering the fact that width and
45@* height are multiples of 2.
46@*
47@* @par description:
48@*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
49@* >> shift,  where shift = 15 - bitdepth
50@*
51@* @param[in] pi2_src1
52@*  pointer to source 1
53@*
54@* @param[in] pi2_src2
55@*  pointer to source 2
56@*
57@* @param[out] pu1_dst
58@*  pointer to destination
59@*
60@* @param[in] src_strd1
61@*  source stride 1
62@*
63@* @param[in] src_strd2
64@*  source stride 2
65@*
66@* @param[in] dst_strd
67@*  destination stride
68@*
69@* @param[in] lvl_shift1
70@*  added before shift and offset
71@*
72@* @param[in] lvl_shift2
73@*  added before shift and offset
74@*
75@* @param[in] ht
76@*  height of the source
77@*
78@* @param[in] wd
79@*  width of the source
80@*
81@* @returns
82@*
83@* @remarks
84@*  none
85@*
86@*******************************************************************************
87@*/
88@void ihevc_weighted_pred_bi_default(word16 *pi2_src1,
89@                                    word16 *pi2_src2,
90@                                    uword8 *pu1_dst,
91@                                    word32 src_strd1,
92@                                    word32 src_strd2,
93@                                    word32 dst_strd,
94@                                    word32 lvl_shift1,
95@                                    word32 lvl_shift2,
96@                                    word32 ht,
97@                                    word32 wd)
98
99@**************variables vs registers*****************************************
100@   r0 => *pi2_src1
101@   r1 => *pi2_src2
102@   r2 => *pu1_dst
103@   r3 =>  src_strd1
104@   r4 =>  src_strd2
105@   r5 =>  dst_strd
106@   r6 =>  lvl_shift1
107@   r7 =>  lvl_shift2
108@   r8 =>  ht
109@   r9 =>  wd
.text
.syntax unified
.align 4

.globl ihevc_weighted_pred_bi_default_a9q

.type ihevc_weighted_pred_bi_default_a9q, %function

@------------------------------------------------------------------------------
@ void ihevc_weighted_pred_bi_default_a9q(word16 *pi2_src1, word16 *pi2_src2,
@                                         uword8 *pu1_dst,
@                                         word32 src_strd1, word32 src_strd2,
@                                         word32 dst_strd,
@                                         word32 lvl_shift1, word32 lvl_shift2,
@                                         word32 ht, word32 wd)
@
@ target : ARMv7-A + NEON, GNU as unified syntax, AAPCS calling convention
@
@ For every sample the routine forms, in 16-bit lanes,
@     sum = src1 + src2 + (lvl_shift1 + lvl_shift2 + 0x40)
@ with saturating adds (vqadd.s16), then narrows with vqshrun.s16 #7, i.e.
@     dst = saturate_u8(sum >> 7)
@ The shift (7) and the rounding term (0x40 = 1 << 6) are hard coded.
@
@ in    : r0 = pi2_src1, r1 = pi2_src2, r2 = pu1_dst, r3 = src_strd1
@         remaining six arguments are read from the caller's stack
@ out   : nothing is returned; results are written through pu1_dst
@ saved : r4-r12 and lr are pushed; d8-d15 are pushed because q4-q7 are used
@         as scratch and AAPCS requires the callee to preserve them
@ clobb : r0-r3, q0-q3, q8-q15, condition flags
@
@ register roles after the prologue:
@   r3  = src_strd1 * 2 (row pitch of src1 in bytes)
@   r4  = src_strd2 * 2 (row pitch of src2 in bytes)
@   r5  = dst_strd
@   r6  = wd * 2        (row width of the sources in bytes)
@   r8  = remaining rows, r9 = remaining columns of the current row group
@   q0  = lvl_shift1 + lvl_shift2 + 0x40 replicated in all eight lanes
@
@ path selection (see chroma_decision / luma_decision):
@   (ht | wd) == 10          -> 8 columns x 2 rows per pass
@   (ht | wd) == 6           -> 4 columns x 2 rows per pass
@   wd == 24                 -> 8 columns x 4 rows per pass
@   wd >= 16                 -> 16 columns x 2 rows per pass (software pipelined)
@   wd == 12                 -> 4 columns x 4 rows per pass
@   wd >= 8                  -> 8 columns x 4 rows per pass
@   anything else            -> 4 columns x 4 rows per pass
@------------------------------------------------------------------------------

@ byte offsets of the stack arguments from sp once the prologue has pushed
@ 10 core registers (40 bytes) and d8-d15 (64 bytes)
.equ SRC_STRD2_OFFSET,  104
.equ DST_STRD_OFFSET,   108
.equ LVL_SHIFT1_OFFSET, 112
.equ LVL_SHIFT2_OFFSET, 116
.equ HT_OFFSET,         120
.equ WD_OFFSET,         124

ihevc_weighted_pred_bi_default_a9q:

    stmfd       sp!, {r4-r12, r14}          @save callee-saved core registers and lr
    vpush       {d8-d15}                    @save callee-saved neon registers (q4-q7 are used below)
    ldr         r4,[sp,#SRC_STRD2_OFFSET]   @load src_strd2
    lsl         r3,r3,#1                    @r3 = 2*src_strd1 (sources hold 16 bit samples)
    ldr         r5,[sp,#DST_STRD_OFFSET]    @load dst_strd
    ldr         r6,[sp,#LVL_SHIFT1_OFFSET]  @load lvl_shift1
    lsl         r4,r4,#1                    @r4 = 2*src_strd2
    ldr         r7,[sp,#LVL_SHIFT2_OFFSET]  @load lvl_shift2
    ldr         r8,[sp,#HT_OFFSET]          @load ht
    ldr         r9,[sp,#WD_OFFSET]          @load wd
    vdup.16     q2,r6                       @q2 = lvl_shift1 in every 16 bit lane
    vdup.16     q3,r7                       @q3 = lvl_shift2 in every 16 bit lane
    vmov.i16    q0,#0x40                    @rounding term 1 << (7 - 1)
    vadd.i16    q2,q3                       @q2 = lvl_shift1 + lvl_shift2
    vadd.s16    q0,q0,q2                    @q0 = lvl_shift1 + lvl_shift2 + rounding
    lsl         r6,r9,#1                    @r6 = 2*wd
    rsb         r7,r6,r3,lsl #2             @r7  = 4*r3 - 2*wd : src1 step to the next group of 4 rows
    rsb         r10,r6,r4,lsl #2            @r10 = 4*r4 - 2*wd : src2 step to the next group of 4 rows

    cmp         r8,#0                       @check ht == 0
    beq         end_loops                   @if equal, then end the function

chroma_decision:
    orr         r14,r8,r9                   @r14 = ht | wd
    cmp         r14,#10
    beq         outer_loop_chroma_8x2       @(ht | wd) == 10 : 8 columns, 2 rows per pass

    cmp         r14,#6
    beq         outer_loop_chroma_4x2       @(ht | wd) == 6 : 4 columns, 2 rows per pass


luma_decision:
    cmp         r9,#24
    beq         outer_loop_8                @wd == 24 is handled 8 columns at a time

    cmp         r9,#16
    bge         outer_loop_16               @every other wd >= 16 uses the 16 column path

    cmp         r9,#12
    beq         outer_loop_4                @wd == 12 is handled 4 columns at a time

    cmp         r9,#8
    bge         outer_loop_8                @remaining wd >= 8 uses the 8 column path
                                            @everything else falls through to the 4 column path


@ 4 columns x 4 rows per pass
outer_loop_4:
    cmp         r9,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function

core_loop_4:
    add         r11,r0,r3                   @r11 = pi2_src1 + one row
    add         r12,r1,r4                   @r12 = pi2_src2 + one row
    vld1.s16    {d6},[r0]!                  @row 0 : 4 samples of src1, pi2_src1 += 8 bytes
    add         r14,r2,r5                   @r14 = pu1_dst + dst_strd
    vld1.s16    {d7},[r1]!                  @row 0 : 4 samples of src2, pi2_src2 += 8 bytes
    vld1.s16    {d8},[r11],r3               @row 1 : src1, r11 moves to row 2
    vqadd.s16   d18,d6,d7                   @row 0 : src1 + src2
    vqadd.s16   d18,d18,d0                  @row 0 : + level shifts and rounding
    vld1.s16    {d9},[r12],r4               @row 1 : src2, r12 moves to row 2
    vqadd.s16   d20,d8,d9                   @row 1 : src1 + src2
    vqadd.s16   d19,d20,d0                  @row 1 : + level shifts and rounding
    vqshrun.s16 d20,q9,#7                   @rows 0,1 : >> 7 and saturate to 8 bit
    vld1.s16    {d22},[r11],r3              @row 2 : src1
    vld1.s16    {d23},[r12],r4              @row 2 : src2
    vqadd.s16   d30,d22,d23                 @row 2 : src1 + src2
    vqadd.s16   d30,d30,d0                  @row 2 : + level shifts and rounding
    vld1.s16    {d24},[r11],r3              @row 3 : src1
    vld1.s16    {d25},[r12],r4              @row 3 : src2
    vqadd.s16   d18,d24,d25                 @row 3 : src1 + src2
    vqadd.s16   d31,d18,d0                  @row 3 : + level shifts and rounding
    vst1.32     {d20[0]},[r2]!              @store row 0 (4 bytes), pu1_dst += 4
    vst1.32     {d20[1]},[r14],r5           @store row 1
    vqshrun.s16 d30,q15,#7                  @rows 2,3 : >> 7 and saturate to 8 bit
    vst1.32     {d30[0]},[r14],r5           @store row 2
    subs        r9,r9,#4                    @4 columns done
    vst1.32     {d30[1]},[r14],r5           @store row 3 (does not touch the flags)
    bgt         core_loop_4                 @more columns left in this group of rows

end_core_loop_4:

    subs        r8,r8,#4                    @4 rows done

    add         r0,r0,r7                    @pi2_src1 -> first column, 4 rows down
    asr         r9,r6,#1                    @reload the column counter with wd
    add         r1,r1,r10                   @pi2_src2 -> first column, 4 rows down
    rsb         r14,r9,r5,lsl #2            @4*dst_strd - wd
    add         r2,r2,r14                   @pu1_dst -> first column, 4 rows down
    bgt         core_loop_4                 @flags still come from the subs on ht

    b           end_loops


@ chroma path : 4 columns x 2 rows per pass
outer_loop_chroma_4x2:
    cmp         r9,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function
    rsb         r7,r6,r3,lsl #1             @r7  = 2*r3 - 2*wd : src1 step to the next pair of rows
    rsb         r10,r6,r4,lsl #1            @r10 = 2*r4 - 2*wd : src2 step to the next pair of rows
core_loop_chroma_4x2:
    add         r11,r0,r3                   @r11 = pi2_src1 + one row
    add         r12,r1,r4                   @r12 = pi2_src2 + one row
    vld1.s16    {d6},[r0]!                  @row 0 : 4 samples of src1, pi2_src1 += 8 bytes
    add         r14,r2,r5                   @r14 = pu1_dst + dst_strd
    vld1.s16    {d7},[r1]!                  @row 0 : 4 samples of src2, pi2_src2 += 8 bytes
    vld1.s16    {d8},[r11],r3               @row 1 : src1
    vqadd.s16   d18,d6,d7                   @row 0 : src1 + src2
    vqadd.s16   d18,d18,d0                  @row 0 : + level shifts and rounding
    vld1.s16    {d9},[r12],r4               @row 1 : src2
    vqadd.s16   d20,d8,d9                   @row 1 : src1 + src2
    vqadd.s16   d19,d20,d0                  @row 1 : + level shifts and rounding
    vqshrun.s16 d20,q9,#7                   @rows 0,1 : >> 7 and saturate to 8 bit
    vst1.32     {d20[0]},[r2]!              @store row 0 (4 bytes), pu1_dst += 4
    vst1.32     {d20[1]},[r14],r5           @store row 1

    subs        r9,r9,#4                    @4 columns done

    bgt         core_loop_chroma_4x2        @more columns left in this pair of rows

end_core_loop_chroma_4x2:

    subs        r8,r8,#2                    @2 rows done

    add         r0,r0,r7                    @pi2_src1 -> first column, 2 rows down
    asr         r9,r6,#1                    @reload the column counter with wd
    add         r1,r1,r10                   @pi2_src2 -> first column, 2 rows down
    rsb         r14,r9,r5,lsl #1            @2*dst_strd - wd
    add         r2,r2,r14                   @pu1_dst -> first column, 2 rows down
    bgt         core_loop_chroma_4x2        @flags still come from the subs on ht

    b           end_loops


@ 8 columns x 4 rows per pass
outer_loop_8:
    cmp         r9,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function
    add         r11,r0,r3                   @r11 = pi2_src1 + one row
    add         r12,r1,r4                   @r12 = pi2_src2 + one row
core_loop_8:

    vld1.s16    {q12},[r0]!                 @row 0 : 8 samples of src1, pi2_src1 += 16 bytes
    add         r14,r2,r5                   @r14 = pu1_dst + dst_strd
    vld1.s16    {q13},[r1]!                 @row 0 : 8 samples of src2, pi2_src2 += 16 bytes
    vqadd.s16   q12,q12,q13                 @row 0 : src1 + src2
    vld1.s16    {q14},[r11],r3              @row 1 : src1
    vqadd.s16   q12,q12,q0                  @row 0 : + level shifts and rounding
    vld1.s16    {q15},[r12],r4              @row 1 : src2
    vld1.s16    {q8},[r11],r3               @row 2 : src1
    vqadd.s16   q11,q14,q15                 @row 1 : src1 + src2
    vld1.s16    {q9},[r12],r4               @row 2 : src2
    vqadd.s16   q11,q11,q0                  @row 1 : + level shifts and rounding
    vqshrun.s16 d20,q12,#7                  @row 0 : >> 7 and saturate to 8 bit
    vld1.s16    {q6},[r11],r3               @row 3 : src1
    vqadd.s16   q15,q8,q9                   @row 2 : src1 + src2
    vqshrun.s16 d21,q11,#7                  @row 1 : >> 7 and saturate to 8 bit
    vld1.s16    {q7},[r12],r4               @row 3 : src2
    vqadd.s16   q15,q15,q0                  @row 2 : + level shifts and rounding
    vst1.32     {d20},[r2]!                 @store row 0 (8 bytes), pu1_dst += 8
    vqadd.s16   q4,q6,q7                    @row 3 : src1 + src2
    vst1.32     {d21},[r14],r5              @store row 1
    vqadd.s16   q4,q4,q0                    @row 3 : + level shifts and rounding
    vqshrun.s16 d30,q15,#7                  @row 2 : >> 7 and saturate to 8 bit
    vqshrun.s16 d31,q4,#7                   @row 3 : >> 7 and saturate to 8 bit
    add         r11,r0,r3                   @r11 = row 1 of the next 8 columns of src1
    add         r12,r1,r4                   @r12 = row 1 of the next 8 columns of src2
    vst1.32     {d30},[r14],r5              @store row 2
    subs        r9,r9,#8                    @8 columns done
    vst1.32     {d31},[r14],r5              @store row 3 (does not touch the flags)
    bgt         core_loop_8                 @more columns left in this group of rows

end_core_loop_8:

    subs        r8,r8,#4                    @4 rows done

    add         r0,r0,r7                    @pi2_src1 -> first column, 4 rows down
    asr         r9,r6,#1                    @reload the column counter with wd
    add         r1,r1,r10                   @pi2_src2 -> first column, 4 rows down
    rsb         r14,r9,r5,lsl #2            @4*dst_strd - wd
    add         r2,r2,r14                   @pu1_dst -> first column, 4 rows down
    add         r11,r0,r3                   @r11 = pi2_src1 + one row
    add         r12,r1,r4                   @r12 = pi2_src2 + one row

    bgt         core_loop_8                 @flags still come from the subs on ht
    b           end_loops


@ chroma path : 8 columns x 2 rows per pass
outer_loop_chroma_8x2:
    cmp         r9,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function
    add         r11,r0,r3                   @r11 = pi2_src1 + one row
    add         r12,r1,r4                   @r12 = pi2_src2 + one row
    rsb         r7,r6,r3,lsl #1             @r7  = 2*r3 - 2*wd : src1 step to the next pair of rows
    rsb         r10,r6,r4,lsl #1            @r10 = 2*r4 - 2*wd : src2 step to the next pair of rows
core_loop_chroma_8x2:

    vld1.s16    {q12},[r0]!                 @row 0 : 8 samples of src1, pi2_src1 += 16 bytes
    add         r14,r2,r5                   @r14 = pu1_dst + dst_strd
    vld1.s16    {q13},[r1]!                 @row 0 : 8 samples of src2, pi2_src2 += 16 bytes
    vqadd.s16   q12,q12,q13                 @row 0 : src1 + src2
    vld1.s16    {q14},[r11],r3              @row 1 : src1
    vqadd.s16   q12,q12,q0                  @row 0 : + level shifts and rounding
    vld1.s16    {q15},[r12],r4              @row 1 : src2
                                            @only two rows are consumed per pass, so no
                                            @third-row load is issued here
    vqadd.s16   q11,q14,q15                 @row 1 : src1 + src2
    vqadd.s16   q11,q11,q0                  @row 1 : + level shifts and rounding
    vqshrun.s16 d20,q12,#7                  @row 0 : >> 7 and saturate to 8 bit
    vqshrun.s16 d21,q11,#7                  @row 1 : >> 7 and saturate to 8 bit
    vst1.32     {d20},[r2]!                 @store row 0 (8 bytes), pu1_dst += 8
    vst1.32     {d21},[r14],r5              @store row 1

    add         r11,r0,r3                   @r11 = row 1 of the next 8 columns of src1
    add         r12,r1,r4                   @r12 = row 1 of the next 8 columns of src2
    subs        r9,r9,#8                    @8 columns done

    bgt         core_loop_chroma_8x2        @more columns left in this pair of rows

end_core_loop_chroma_8x2:

    subs        r8,r8,#2                    @2 rows done

    add         r0,r0,r7                    @pi2_src1 -> first column, 2 rows down
    asr         r9,r6,#1                    @reload the column counter with wd
    add         r1,r1,r10                   @pi2_src2 -> first column, 2 rows down
    rsb         r14,r9,r5,lsl #1            @2*dst_strd - wd
    add         r2,r2,r14                   @pu1_dst -> first column, 2 rows down
    add         r11,r0,r3                   @r11 = pi2_src1 + one row
    add         r12,r1,r4                   @r12 = pi2_src2 + one row

    bgt         core_loop_chroma_8x2        @flags still come from the subs on ht

    b           end_loops


@ 16 columns x 2 rows per pass, software pipelined : the sums of block n are
@ formed while block n-1 is stored and block n+1 is loaded.
@ NOTE(review): the src2 pointer (r1) is stepped with r7, r11 and r12, which
@ are all derived from src_strd1 - this path assumes src_strd1 == src_strd2.
outer_loop_16:
    cmp         r9,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function
    rsb         r7,r6,r3,lsl #1             @r7 = 2*r3 - 2*wd : source step to the next pair of rows
    mov         r14,#16
    sub         r10,r14,r5                  @r10 = 16 - dst_strd : dst row 1 -> row 0, next 16 columns
    sub         r11,r3,r14                  @r11 = r3 - 16 : source row 0 columns 8-15 -> row 1 column 0
    sub         r12,r14,r3                  @r12 = 16 - r3 : source row 1 columns 8-15 -> row 0, next block

    rsb         r14,r9,r5,lsl #1            @r14 = 2*dst_strd - wd : dst step to the next pair of rows


prolog_16:

    vld1.s16    {q1},[r0]!                  @row 0, columns 0-7  : src1
    vld1.s16    {q2},[r1]!                  @row 0, columns 0-7  : src2
    vld1.s16    {q5},[r0],r11               @row 0, columns 8-15 : src1, then move to row 1
    vld1.s16    {q6},[r1],r11               @row 0, columns 8-15 : src2, then move to row 1
    vld1.s16    {q3},[r0]!                  @row 1, columns 0-7  : src1
    subs        r9,r9,#16                   @16 columns loaded
    vld1.s16    {q4},[r1]!                  @row 1, columns 0-7  : src2
    subeq       r8,r8,#2                    @pair of rows fully loaded : ht -= 2
    vqadd.s16   q11,q1,q2                   @row 0, columns 0-7  : src1 + src2
    vld1.s16    {q7},[r0],r12               @row 1, columns 8-15 : src1, then back to row 0
    vqadd.s16   q14,q5,q6                   @row 0, columns 8-15 : src1 + src2
    vld1.s16    {q8},[r1],r12               @row 1, columns 8-15 : src2, then back to row 0
    addeq       r0,r0,r7                    @end of the pair of rows : pi2_src1 two rows down
    addeq       r1,r1,r7                    @end of the pair of rows : pi2_src2 two rows down
    vqadd.s16   q12,q3,q4                   @row 1, columns 0-7  : src1 + src2
    vqadd.s16   q13,q7,q8                   @row 1, columns 8-15 : src1 + src2
@ a single 16x2 block has nothing further to load
    cmp         r8,#0
    beq         epilog_16

    vld1.s16    {q1},[r0]!                  @next block, row 0, columns 0-7 : src1 (issued only
                                            @after the exit test so that nothing is read past
                                            @the last block)
    vld1.s16    {q2},[r1]!                  @next block, row 0, columns 0-7  : src2
    vqadd.s16   q11,q11,q0                  @row 0, columns 0-7  : + level shifts and rounding
    vld1.s16    {q5},[r0],r11               @next block, row 0, columns 8-15 : src1
    vqadd.s16   q14,q14,q0                  @row 0, columns 8-15 : + level shifts and rounding
    vld1.s16    {q6},[r1],r11               @next block, row 0, columns 8-15 : src2
    vqadd.s16   q12,q12,q0                  @row 1, columns 0-7  : + level shifts and rounding
    vld1.s16    {q3},[r0]!                  @next block, row 1, columns 0-7  : src1
    vqadd.s16   q15,q13,q0                  @row 1, columns 8-15 : + level shifts and rounding
    vqshrun.s16 d20,q11,#7                  @q10 (d20,d21) = 16 output bytes of row 0
    vld1.s16    {q4},[r1]!                  @next block, row 1, columns 0-7  : src2
    vqshrun.s16 d21,q14,#7
    vld1.s16    {q7},[r0],r12               @next block, row 1, columns 8-15 : src1
    vqshrun.s16 d26,q12,#7                  @q13 (d26,d27) = 16 output bytes of row 1
    vld1.s16    {q8},[r1],r12               @next block, row 1, columns 8-15 : src2
    vqshrun.s16 d27,q15,#7


core_loop_16:

    cmp         r9,#0                       @was the block held in q10/q13 the last of its rows ?
    vqadd.s16   q11,q1,q2                   @row 0, columns 0-7  : src1 + src2
    asreq       r9,r6,#1                    @yes : reload the column counter with wd
    vst1.32     {q10},[r2],r5               @store row 0 of the previous block, move to row 1
    vqadd.s16   q14,q5,q6                   @row 0, columns 8-15 : src1 + src2
    vst1.32     {q13},[r2],r10              @store row 1, move back to row 0 of the next 16 columns
    addeq       r2,r2,r14                   @end of the pair of rows : pu1_dst two rows down
    vqadd.s16   q12,q3,q4                   @row 1, columns 0-7  : src1 + src2
    subs        r9,r9,#16                   @account for the block whose sums are being formed
    addeq       r0,r0,r7                    @it ends its rows : pi2_src1 two rows down
    vqadd.s16   q13,q7,q8                   @row 1, columns 8-15 : src1 + src2

    addeq       r1,r1,r7                    @it ends its rows : pi2_src2 two rows down
    subseq      r8,r8,#2                    @and ht -= 2 (flags are updated only in that case)
    beq         epilog_16                   @taken only when ht reached 0 : nothing left to load


    vqadd.s16   q11,q11,q0                  @row 0, columns 0-7  : + level shifts and rounding
    vld1.s16    {q1},[r0]!                  @next block, row 0, columns 0-7  : src1
    vqadd.s16   q14,q14,q0                  @row 0, columns 8-15 : + level shifts and rounding
    vld1.s16    {q2},[r1]!                  @next block, row 0, columns 0-7  : src2
    vqadd.s16   q12,q12,q0                  @row 1, columns 0-7  : + level shifts and rounding
    vld1.s16    {q5},[r0],r11               @next block, row 0, columns 8-15 : src1
    vqadd.s16   q15,q13,q0                  @row 1, columns 8-15 : + level shifts and rounding
    vld1.s16    {q6},[r1],r11               @next block, row 0, columns 8-15 : src2
    vqshrun.s16 d20,q11,#7                  @q10 (d20,d21) = 16 output bytes of row 0
    vld1.s16    {q3},[r0]!                  @next block, row 1, columns 0-7  : src1
    vqshrun.s16 d21,q14,#7
    vld1.s16    {q4},[r1]!                  @next block, row 1, columns 0-7  : src2
    vqshrun.s16 d26,q12,#7                  @q13 (d26,d27) = 16 output bytes of row 1
    vld1.s16    {q7},[r0],r12               @next block, row 1, columns 8-15 : src1
    vqshrun.s16 d27,q15,#7
    vld1.s16    {q8},[r1],r12               @next block, row 1, columns 8-15 : src2


    b           core_loop_16


@ drain the pipeline : finish and store the last block
epilog_16:

    vqadd.s16   q11,q11,q0                  @row 0, columns 0-7  : + level shifts and rounding
    vqadd.s16   q14,q14,q0                  @row 0, columns 8-15 : + level shifts and rounding
    vqadd.s16   q12,q12,q0                  @row 1, columns 0-7  : + level shifts and rounding
    vqadd.s16   q15,q13,q0                  @row 1, columns 8-15 : + level shifts and rounding
    vqshrun.s16 d20,q11,#7                  @q10 (d20,d21) = 16 output bytes of row 0
    vqshrun.s16 d21,q14,#7
    vqshrun.s16 d26,q12,#7                  @q13 (d26,d27) = 16 output bytes of row 1
    vqshrun.s16 d27,q15,#7
    vst1.32     {q10},[r2],r5               @store row 0, move to row 1
    vst1.32     {q13},[r2]                  @store row 1


end_core_loop_16:


@ common exit : every path, including the early outs, arrives here with the
@ prologue's pushes still on the stack
end_loops:
    vpop        {d8-d15}                    @restore the callee-saved neon registers
    ldmfd       sp!,{r4-r12,r15}            @restore core registers and return (pc <- saved lr)
492
493
494
495
496