@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@******************************************************************************
@* @file
@*  ihevc_inter_pred_filters_luma_vert_w16inp.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*
@*  - ihevc_inter_pred_luma_vert_w16inp_w16out()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
@/* include reconstruction */
@

@/**
@*******************************************************************************
@*
@* @brief
@*    luma vertical filter for 16-bit input and 16-bit output.
@*
@* @par description:
@*     applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@*     the elements pointed by 'pi2_src' and writes to the location pointed by
@*     'pu1_dst'. input is 16 bits; the filter output is downshifted by 6 with
@*     an offset removed, and is stored as 16-bit output without clipping.
@*     assumptions: the function is optimized assuming that width is a
@*     multiple of 4 and height is a multiple of 2.
@*
@* @param[in] pi2_src
@*  word16 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_vert_w16inp_w16out(word16 *pi2_src,
@                                    uword8 *pu1_dst,
@                                    word32 src_strd,
@                                    word32 dst_strd,
@                                    word8 *pi1_coeff,
@                                    word32 ht,
@                                    word32 wd   )
@**************variables vs registers*****************************************
@   r0 => *pi2_src
@   r1 => *pu1_dst
@   r2 =>  src_strd
@   r3 =>  dst_strd
@   r4 => *pi1_coeff
@   r5 =>  ht
@   r6 =>  wd

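@ as a readable reference, the block below sketches in c what the assembly
@ implements per output pixel: an 8-tap vertical multiply-accumulate on 16-bit
@ input, minus 0x80000, arithmetic shift right by 6, stored as a 16-bit result.
@ this is only an illustrative sketch reconstructed from the instructions that
@ follow; the destination name 'pi2_dst', the helper name and the plain c types
@ are assumptions, not the reference code from ihevc_inter_pred_filters.c.
@
@   static void luma_vert_w16inp_w16out_sketch(const short *pi2_src, short *pi2_dst,
@                                              int src_strd, int dst_strd,
@                                              const signed char *pi1_coeff,
@                                              int ht, int wd)
@   {
@       int row, col, i;
@       for(row = 0; row < ht; row++)
@       {
@           for(col = 0; col < wd; col++)
@           {
@               int i4_tmp = 0;                 /* 32-bit accumulator (one neon lane) */
@               for(i = 0; i < 8; i++)          /* taps cover rows -3 .. +4 */
@                   i4_tmp += pi1_coeff[i] * pi2_src[col + (i - 3) * src_strd];
@               pi2_dst[col] = (short)((i4_tmp - 0x80000) >> 6);
@           }
@           pi2_src += src_strd;
@           pi2_dst += dst_strd;
@       }
@   }
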
.text
.align 4




.globl ihevc_inter_pred_luma_vert_w16inp_w16out_a9q

.type ihevc_inter_pred_luma_vert_w16inp_w16out_a9q, %function

ihevc_inter_pred_luma_vert_w16inp_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r12,[sp,#40]                @load pi1_coeff
    mov         r6,r3,lsl #1                @dst_strd in bytes (16-bit output)
    ldr         r5,[sp,#48]                 @load wd
    vld1.8      {d0},[r12]                  @coeff = vld1_s8(pi1_coeff)
    mov         r2, r2, lsl #1              @src_strd in bytes (16-bit input)
    sub         r12,r2,r2,lsl #2            @r12 = -(3 * src_strd)
    @vabs.s8    d0,d0               @vabs_s8(coeff)
    add         r0,r0,r12                   @pi2_src -= 3 * src_strd
    ldr         r3,[sp,#44]                 @load ht
    subs        r7,r3,#0                    @r7 = ht
    @ble        end_loops           @end loop jump
    vmovl.s8    q0,d0
    vdup.16     d22,d0[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
    vdup.16     d23,d0[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
    vdup.16     d24,d0[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
    vdup.16     d25,d0[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
    vdup.16     d26,d1[0]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
    vdup.16     d27,d1[1]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
    vdup.16     d28,d1[2]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
    vdup.16     d29,d1[3]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
    vmov.i32    q15,#0x80000
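@ note: 0x80000 == 0x2000 << 6, so the vsub-with-q15 / vshrn-by-6 pairs below
@ compute (accumulator >> 6) - 0x2000 per lane, i.e. a plain downshift by 6
@ with an offset of 0x2000 removed, in place of the round/shift/clip to 8 bits
@ done by the commented-out vqrshrun path.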

    rsb         r9,r5,r6,lsl #2             @r6->dst_strd   r5  ->wd
    rsb         r8,r5,r2,lsl #2             @r2->src_strd
    sub         r8,r8,r5
    sub         r9,r9,r5
    mov         r3, r5, lsr #2              @r3 = wd / 4
    mul         r7, r3                      @r7 = ht * (wd / 4)
    sub         r7, #4                      @subtract one iteration (4) for the epilog
    mov         r4,r5                       @r4 = wd (column counter)
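@ the setup above, with r2/r6 already holding the strides in bytes and
@ r5 = wd in pixels (2 bytes each), works out to:
@   r8 = 4*src_strd_bytes - 2*wd    @added to pi2_src (addle) after the last
@                                   @4-wide strip of a 4-row band: down 4 rows
@   r9 = 4*dst_strd_bytes - 2*wd    @same end-of-band advance for the destination
@   r7 = ht*(wd/4) - 4              @loop budget, decremented by 4 per 4x4
@                                   @block; one block is left for the epilog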
    @mov            r2, r2, lsl #1

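@ the filter body is software pipelined: 'prolog' issues the first loads and
@ starts four accumulators (q4-q7, one per output row of a 4x4 block),
@ 'kernel_8' stores four finished rows per iteration while loading and
@ accumulating the next block, and 'epilog'/'epilog_end' drain the remaining
@ accumulators.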
prolog:

    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vld1.16     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    subs        r4,r4,#4
    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@

    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@

    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    addle       r0,r0,r8,lsl #0
    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
    movle       r4,r5                       @r5 ->wd
    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@
    vsub.s32    q4, q4, q15

    vld1.16     {d1},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.s16   q6,d3,d23
    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q6,d2,d22
    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q6,d4,d24
    vmlal.s16   q6,d5,d25
    vmlal.s16   q6,d6,d26
    vmlal.s16   q6,d7,d27
    vmlal.s16   q6,d16,d28
    vmlal.s16   q6,d17,d29
    add         r14,r1,r6
    vsub.s32    q5, q5, q15
    vshrn.s32   d8, q4, #6
    @vqrshrun.s16 d8,q4,#6          @sto_res = vqmovun_s16(sto_res_tmp)@

    vmull.s16   q7,d4,d23
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vmlal.s16   q7,d6,d25
    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d7,d26
    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d16,d27
    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d17,d28
    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d18,d29
    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vst1.32     {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@
    vsub.s32    q6, q6, q15
    vshrn.s32   d10, q5, #6
    @vqrshrun.s16 d10,q5,#6         @sto_res = vqmovun_s16(sto_res_tmp)@
    addle       r1,r1,r9

    subs        r7,r7,#4


    blt         epilog_end                  @jumps to epilog_end
    beq         epilog                      @jumps to epilog

kernel_8:

    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    subs        r4,r4,#4
    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
    addle       r0,r0,r8,lsl #0
    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.32     {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@

    vsub.s32    q7, q7, q15
    vshrn.s32   d12, q6, #6
    @vqrshrun.s16 d12,q6,#6
    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@

    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
    vst1.32     {d12},[r14],r6

    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@

    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@

    vsub.s32    q4, q4, q15
    vshrn.s32   d14, q7, #6
    @vqrshrun.s16 d14,q7,#6

    vmull.s16   q6,d3,d23
    movle       r4,r5                       @r5 ->wd

    vmlal.s16   q6,d2,d22
    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@

    vmlal.s16   q6,d4,d24
    add         r3,r0,r2                    @pu1_src_tmp += src_strd@

    vmlal.s16   q6,d5,d25

    vmlal.s16   q6,d6,d26
    vst1.32     {d14},[r14],r6

    vmlal.s16   q6,d7,d27
    vld1.16     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@

    vmlal.s16   q6,d16,d28
    add         r14,r1,r6

    vmlal.s16   q6,d17,d29
    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@

    vsub.s32    q5, q5, q15
    vshrn.s32   d8, q4, #6
    @vqrshrun.s16 d8,q4,#6          @sto_res = vqmovun_s16(sto_res_tmp)@
    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@

    vmull.s16   q7,d4,d23
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vmlal.s16   q7,d6,d25
    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d7,d26
    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d16,d27
    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d17,d28
    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d18,d29
    vst1.32     {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@

    vsub.s32    q6, q6, q15
    vshrn.s32   d10, q5, #6
    addle       r1,r1,r9

    @vqrshrun.s16 d10,q5,#6         @sto_res = vqmovun_s16(sto_res_tmp)@
    subs        r7,r7,#4

    bgt         kernel_8                    @jumps to kernel_8

epilog:

    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.32     {d10},[r14],r6

    vsub.s32    q7, q7, q15
    vshrn.s32   d12, q6, #6
    @vqrshrun.s16 d12,q6,#6

    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@
    vst1.32     {d12},[r14],r6

    vsub.s32    q4, q4, q15
    vshrn.s32   d14, q7, #6
    @vqrshrun.s16 d14,q7,#6

    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmull.s16   q6,d3,d23
    vmlal.s16   q6,d2,d22
    vmlal.s16   q6,d4,d24
    vmlal.s16   q6,d5,d25
    vmlal.s16   q6,d6,d26
    vmlal.s16   q6,d7,d27
    vmlal.s16   q6,d16,d28
    vmlal.s16   q6,d17,d29
    vst1.32     {d14},[r14],r6
    vsub.s32    q5, q5, q15
    vshrn.s32   d8, q4, #6
    @vqrshrun.s16 d8,q4,#6          @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.s16   q7,d4,d23
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vmlal.s16   q7,d6,d25
    vmlal.s16   q7,d7,d26
    vmlal.s16   q7,d16,d27
    vmlal.s16   q7,d17,d28
    vmlal.s16   q7,d18,d29
    vsub.s32    q6, q6, q15
    vshrn.s32   d10, q5, #6
    @vqrshrun.s16 d10,q5,#6         @sto_res = vqmovun_s16(sto_res_tmp)@

    add         r14,r1,r6
    vst1.32     {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@

epilog_end:
    vst1.32     {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@
    vshrn.s32   d12, q6, #6
    @vqrshrun.s16 d12,q6,#6

    vst1.32     {d12},[r14],r6
    vsub.s32    q7, q7, q15
    vshrn.s32   d14, q7, #6
    @vqrshrun.s16 d14,q7,#6

    vst1.32     {d14},[r14],r6


end_loops:

    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp