@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_inter_pred_chroma_horz_neon.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
@*
@* @author
@*  yogeswaran rs / akshaya mukund
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    chroma inter prediction filter for horizontal input
@*
@* @par description:
@*    applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
@*    to the elements pointed to by 'pu1_src' and writes to the location
@*    pointed to by 'pu1_dst'. the output is downshifted by 6 and clipped to
@*    8 bits.
@*    assumptions: the function is optimized assuming that width is a
@*    multiple of 2, 4 or 8. if width is 2, then height should be a multiple
@*    of 2. widths of 4 and 8 are optimized further.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_chroma_horz(uword8 *pu1_src,
@                                   uword8 *pu1_dst,
@                                   word32 src_strd,
@                                   word32 dst_strd,
@                                   word8 *pi1_coeff,
@                                   word32 ht,
@                                   word32 wd)
@**************variables vs registers*****************************************
@r0 => *pu1_src
@r1 => *pu1_dst
@r2 =>  src_strd
@r3 =>  dst_strd
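@
@ a minimal scalar sketch of the computation (illustrative only, not part of
@ the build and not the library's c reference verbatim): chroma samples are
@ interleaved cb/cr, so the four taps of one component sit 2 bytes apart and
@ a row holds 2*wd output bytes. the neon vqrshrun used below corresponds to
@ the rounded downshift (sum + 32) >> 6 followed by a clip to [0, 255].
@
@    for(row = 0; row < ht; row++)
@        for(col = 0; col < 2 * wd; col++)
@        {
@            word32 sum = 0, tap;
@            for(tap = 0; tap < 4; tap++)
@                sum += pi1_coeff[tap] *
@                       pu1_src[row * src_strd + col + (tap - 1) * 2];
@            sum = (sum + 32) >> 6;           /* rounded downshift by 6 */
@            if(sum < 0) sum = 0;
@            if(sum > 255) sum = 255;         /* clip to 8 bits */
@            pu1_dst[row * dst_strd + col] = (uword8)sum;
@        }
@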

.text
.align 4



.globl ihevc_inter_pred_chroma_horz_a9q

.type ihevc_inter_pred_chroma_horz_a9q, %function

ihevc_inter_pred_chroma_horz_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r4,[sp,#40]                 @loads pi1_coeff
    ldr         r7,[sp,#44]                 @loads ht
    ldr         r10,[sp,#48]                @loads wd

    vld1.8      {d0},[r4]                   @coeff = vld1_s8(pi1_coeff)
    subs        r14,r7,#0                   @checks for ht == 0
    vabs.s8     d2,d0                       @vabs_s8(coeff)
    mov         r11,#2
    ble         end_loops

    vdup.8      d24,d2[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub         r12,r0,#2                   @pu1_src - 2
    vdup.8      d25,d2[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add         r4,r12,r2                   @pu1_src_tmp2_8 = pu1_src + src_strd
    vdup.8      d26,d2[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)

    tst         r10,#3                      @checks wd for multiples of 4
    mov         r5,r10,lsl #1

    vdup.8      d27,d2[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)

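@ dispatch on block size: wd not a multiple of 4 goes to outer_loop_4,
@ wd >= 8 (except wd == 12) goes to outer_loop_16, and the remaining
@ widths (4 and 12) go to outer_loop_ht_4 when ht is a multiple of 4,
@ else to outer_loop_8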
    bne         outer_loop_4
    cmp         r10,#12
    beq         skip_16

    cmp         r10,#8
    bge         outer_loop_16
skip_16:
    tst         r7,#3

    sub         r9,r0,#2
    beq         outer_loop_ht_4             @ht is a multiple of 4

    b           outer_loop_8

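@ outer_loop_16 produces two rows of 16 output bytes per iteration and is
@ software pipelined: the loads and multiply-accumulates issued below prime
@ the first iteration, inner_loop_16 overlaps the narrow/store of one
@ iteration with the loads and macs of the next, and epilog/epilog_end
@ drain the results still in flight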
outer_loop_16:
    mov         r10,r5                      @2wd
    mul         r14,r14,r10

    rsb         r6,r3,#16

    add         r4,r12,r2
    mov         r9,#10
    and         r0, r12, #31
    rsb         r8,r5,r3,lsl #1
    pld         [r12, r2, lsl #1]




    vld1.u32    {q0},[r12],r11              @vector load pu1_src
    pld         [r4, r2, lsl #1]
    vld1.u32    {q1},[r12],r11              @vector load pu1_src

    vld1.u32    {q2},[r12],r11              @vector load pu1_src

    vld1.u32    {q3},[r12],r9               @vector load pu1_src


    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_2], coeffabs_1)@
    vld1.u32    {q4},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    vld1.u32    {q5},[r4],r11               @vector load pu1_src
    vmlal.u8    q15,d4,d26                  @mul_res = vmlal_u8(src[0_4], coeffabs_2)@
    vld1.u32    {q6},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d6,d27                  @mul_res = vmlsl_u8(src[0_6], coeffabs_3)@
    vld1.u32    {q7},[r4],r9                @vector load pu1_src
    vmull.u8    q14,d3,d25

    vmlsl.u8    q14,d1,d24


    vmlal.u8    q14,d5,d26

    vmlsl.u8    q14,d7,d27


    cmp         r14,#32
    beq         epilog_end
    sub         r14,#64
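@ r14 now counts the remaining output bytes (ht * 2wd); each inner_loop_16
@ iteration stores 32 of them (16 per row over two rows), so 64 bytes are
@ reserved here for the two pipeline stages drained in epilog and epilog_end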

inner_loop_16:




@    bgt            l_2

@   pld         [r12, r2, lsl #1]
@   pld         [r4, r2, lsl #1]

    pld         [r12, r2, lsl #2]
    pld         [r4, r2, lsl #2]

    subs        r10,r10,#16

    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_2], coeffabs_1)@


    addeq       r12,r12,r8
    addeq       r4,r12,r2
    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@



    vqrshrun.s16 d30,q15,#6

    vld1.u32    {q0},[r12],r11              @vector load pu1_src
    vqrshrun.s16 d31,q14,#6


    vld1.u32    {q1},[r12],r11              @vector load pu1_src
    vmlal.u8    q11,d12,d26                 @mul_res = vmlal_u8(src[0_4], coeffabs_2)@




    vld1.u32    {q2},[r12],r11              @vector load pu1_src
    vmlsl.u8    q11,d14,d27                 @mul_res = vmlsl_u8(src[0_6], coeffabs_3)@


    vld1.u32    {q3},[r12],r9               @vector load pu1_src
    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_2], coeffabs_1)@

    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vst1.16     {q15}, [r1],r3
    vmlal.u8    q10,d13,d26                 @mul_res = vmlal_u8(src[0_4], coeffabs_2)@

    vld1.u32    {q4},[r4],r11               @vector load pu1_src
    vmlsl.u8    q10,d15,d27                 @mul_res = vmlsl_u8(src[0_6], coeffabs_3)@


    vld1.u32    {q5},[r4],r11               @vector load pu1_src
    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_2], coeffabs_1)@

    vld1.u32    {q6},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {q7},[r4],r9                @vector load pu1_src
    vmlal.u8    q15,d4,d26                  @mul_res = vmlal_u8(src[0_4], coeffabs_2)@

    cmp         r10,#0
    vqrshrun.s16 d22,q11,#6
    vqrshrun.s16 d23,q10,#6



    vmlsl.u8    q15,d6,d27                  @mul_res = vmlsl_u8(src[0_6], coeffabs_3)@

    moveq       r10,r5                      @2wd
    vmull.u8    q14,d3,d25


    vst1.16     {q11},[r1],r6               @store the result pu1_dst
    vmlsl.u8    q14,d1,d24


    addeq       r1,r1,r8
    vmlal.u8    q14,d5,d26

    subs        r14,r14,#32                 @decrement the output byte count (ht * 2wd)
    vmlsl.u8    q14,d7,d27

@     mov           r0, r7

    bgt         inner_loop_16



    add         r14,r14,#64
    cmp         r14,#32
    beq         epilog_end

epilog:
    vqrshrun.s16 d30,q15,#6
    vqrshrun.s16 d31,q14,#6



    vst1.16     {q15}, [r1],r3
    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_2], coeffabs_1)@




    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    subs        r10,r10,#16                 @decrement the wd loop
    vmlal.u8    q11,d12,d26                 @mul_res = vmlal_u8(src[0_4], coeffabs_2)@
    addeq       r12,r12,r8
    vmlsl.u8    q11,d14,d27                 @mul_res = vmlsl_u8(src[0_6], coeffabs_3)@
    moveq       r10,r5                      @2wd


    addeq       r4,r12,r2
    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_2], coeffabs_1)@
    vld1.u32    {q0},[r12],r11              @vector load pu1_src
    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    vld1.u32    {q1},[r12],r11              @vector load pu1_src
    vmlal.u8    q10,d13,d26                 @mul_res = vmlal_u8(src[0_4], coeffabs_2)@
    vld1.u32    {q2},[r12],r11              @vector load pu1_src
    vmlsl.u8    q10,d15,d27                 @mul_res = vmlsl_u8(src[0_6], coeffabs_3)@
    vld1.u32    {q3},[r12],r9               @vector load pu1_src
    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_2], coeffabs_1)@


    vld1.u32    {q4},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    vld1.u32    {q5},[r4],r11               @vector load pu1_src
    vmlal.u8    q15,d4,d26                  @mul_res = vmlal_u8(src[0_4], coeffabs_2)@

    vmlsl.u8    q15,d6,d27                  @mul_res = vmlsl_u8(src[0_6], coeffabs_3)@

    vld1.u32    {q6},[r4],r11               @vector load pu1_src
    vmull.u8    q14,d3,d25
    vld1.u32    {q7},[r4],r9                @vector load pu1_src
    vmlsl.u8    q14,d1,d24
    vqrshrun.s16 d22,q11,#6
    vqrshrun.s16 d23,q10,#6

    vst1.16     {q11},[r1],r6               @store the result pu1_dst
    vmlal.u8    q14,d5,d26

    vmlsl.u8    q14,d7,d27
    addeq       r1,r1,r8



epilog_end:
    vqrshrun.s16 d30,q15,#6
    vqrshrun.s16 d31,q14,#6


    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_2], coeffabs_1)@
    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    vmlal.u8    q11,d12,d26                 @mul_res = vmlal_u8(src[0_4], coeffabs_2)@
    vmlsl.u8    q11,d14,d27                 @mul_res = vmlsl_u8(src[0_6], coeffabs_3)@


    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_2], coeffabs_1)@
    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    vmlal.u8    q10,d13,d26                 @mul_res = vmlal_u8(src[0_4], coeffabs_2)@
    vmlsl.u8    q10,d15,d27                 @mul_res = vmlsl_u8(src[0_6], coeffabs_3)@
    vqrshrun.s16 d22,q11,#6
    vqrshrun.s16 d23,q10,#6


    vst1.16     {q15}, [r1],r3

    vst1.16     {q11},[r1]                  @store the result pu1_dst



    b           end_loops
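
@ outer_loop_8 filters two rows of 8 output bytes per iteration; the four
@ d-register loads stepped by 2 bytes replace the vext-based extraction
@ kept in the comments below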
outer_loop_8:


    add         r6,r1,r3                    @pu1_dst + dst_strd
    mov         r7,r5
    add         r4,r12,r2                   @pu1_src + src_strd


inner_loop_8:
    @vld1.u32  {d0,d1},[r12],r11               @vector load pu1_src
    vld1.u32    {d0},[r12],r11              @vector load pu1_src
    vld1.u32    {d1},[r12],r11              @vector load pu1_src
    vld1.u32    {d2},[r12],r11              @vector load pu1_src
    vld1.u32    {d3},[r12],r11              @vector load pu1_src

    @vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
    vmull.u8    q4,d1,d25                   @mul_res = vmull_u8(src[0_2], coeffabs_1)@
    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    @vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
    @vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
    vmlal.u8    q4,d2,d26                   @mul_res = vmlal_u8(src[0_4], coeffabs_2)@
    vmlsl.u8    q4,d3,d27                   @mul_res = vmlsl_u8(src[0_6], coeffabs_3)@

    vld1.u32    {d4},[r4],r11               @vector load pu1_src
    vld1.u32    {d5},[r4],r11               @vector load pu1_src
    vld1.u32    {d6},[r4],r11               @vector load pu1_src
    vld1.u32    {d7},[r4],r11               @vector load pu1_src
    @vld1.u32  {d12,d13},[r4],r11              @vector load pu1_src + src_strd
    @vext.u8   d14,d12,d13,#2                  @vector extract of src[0_2]
    vmull.u8    q5,d5,d25                   @mul_res = vmull_u8(src[0_2], coeffabs_1)@
    vmlsl.u8    q5,d4,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    @vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
    @vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
    vqrshrun.s16 d8,q4,#6                   @right shift and saturating narrow result 1
    vmlal.u8    q5,d6,d26                   @mul_res = vmlal_u8(src[0_4], coeffabs_2)@
    vmlsl.u8    q5,d7,d27                   @mul_res = vmlsl_u8(src[0_6], coeffabs_3)@

    vst1.8      {d8},[r1]!                  @store the result pu1_dst

    vqrshrun.s16 d10,q5,#6                  @right shift and saturating narrow result 2
    subs        r7,r7,#8                    @decrement the wd loop
    vst1.8      {d10},[r6]!                 @store the result pu1_dst
    bgt         inner_loop_8

    sub         r12,r12,r5
    subs        r14,r14,#2                  @decrement the ht loop
    sub         r1,r1,r5
    add         r12,r12,r2,lsl #1
    add         r1,r1,r3,lsl #1
    bgt         outer_loop_8
    b           end_loops

@handles the case where ht is a multiple of 4
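@ four rows are kept in flight here: inner_loop_ht_4 primes rows 1-4,
@ core_loop overlaps the stores of the previous four results with the loads
@ and macs of the next four, and epilogue drains the last four rows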
outer_loop_ht_4:

    mov         r7,r5

prologue_ht_4:

inner_loop_ht_4:

    mov         r12,r9
    mov         r4,r1

    sub         r8, r2, #6

    vld1.u32    {d0},[r12],r11              @(1)vector load pu1_src
    vld1.u32    {d1},[r12],r11              @(1)vector load pu1_src
    vld1.u32    {d2},[r12],r11              @(1)vector load pu1_src
    @vld1.u32  {d3},[r12],r2               @(1)vector load pu1_src
    vld1.u32    {d3},[r12],r8               @(1)vector load pu1_src

    @sub       r12, r12, #6                @(1)

    vld1.u32    {d4},[r12],r11              @(2)vector load pu1_src
    vld1.u32    {d5},[r12],r11              @(2)vector load pu1_src
    vld1.u32    {d6},[r12],r11              @(2)vector load pu1_src
    @vld1.u32  {d7},[r12],r2               @(2)vector load pu1_src
    vld1.u32    {d7},[r12],r8               @(2)vector load pu1_src

    @sub       r12, r12, #6                @(2)

    vld1.u32    {d14},[r12],r11             @(3)vector load pu1_src
    vmull.u8    q4,d1,d25                   @(1)mul_res = vmull_u8(src[0_2], coeffabs_1)@

    vld1.u32    {d15},[r12],r11             @(3)vector load pu1_src
    vmlsl.u8    q4,d0,d24                   @(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d16},[r12],r11             @(3)vector load pu1_src
    vmlal.u8    q4,d2,d26                   @(1)mul_res = vmlal_u8(src[0_4], coeffabs_2)@

    @vld1.u32  {d17},[r12],r2              @(3)vector load pu1_src
    vld1.u32    {d17},[r12],r8              @(3)vector load pu1_src
    vmlsl.u8    q4,d3,d27                   @(1)mul_res = vmlsl_u8(src[0_6], coeffabs_3)@

    @sub       r12, r12, #6                @(3)
    vmull.u8    q5,d5,d25                   @(2)mul_res = vmull_u8(src[0_2], coeffabs_1)@

    vld1.u32    {d18},[r12],r11             @(4)vector load pu1_src
    vmlsl.u8    q5,d4,d24                   @(2)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d19},[r12],r11             @(4)vector load pu1_src
    vmlal.u8    q5,d6,d26                   @(2)mul_res = vmlal_u8(src[0_4], coeffabs_2)@

    vld1.u32    {d20},[r12],r11             @(4)vector load pu1_src
    vmlsl.u8    q5,d7,d27                   @(2)mul_res = vmlsl_u8(src[0_6], coeffabs_3)@

    vld1.u32    {d21},[r12],r2              @(4)vector load pu1_src
    vqrshrun.s16 d8,q4,#6                   @(1)right shift and saturating narrow result 1

    add         r9,r9,#8                    @(core loop)

    subs        r7,r7,#8                    @(prologue)decrement the wd loop
    beq         epilogue

core_loop:
    mov         r12,r9

    vld1.u32    {d0},[r12],r11              @(1_1)vector load pu1_src
    vmull.u8    q6,d15,d25                  @(3)mul_res = vmull_u8(src[0_2], coeffabs_1)@

    vld1.u32    {d1},[r12],r11              @(1_1)vector load pu1_src
    vmlsl.u8    q6,d14,d24                  @(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d2},[r12],r11              @(1_1)vector load pu1_src
    vmlal.u8    q6,d16,d26                  @(3)mul_res = vmlal_u8(src[0_4], coeffabs_2)@

    @vld1.u32  {d3},[r12],r2               @(1_1)vector load pu1_src
    vld1.u32    {d3},[r12],r8               @(1_1)vector load pu1_src
    vmlsl.u8    q6,d17,d27                  @(3)mul_res = vmlsl_u8(src[0_6], coeffabs_3)@

    @sub       r12, r12, #6                @(1_1)

    vst1.8      {d8},[r4],r3                @(1)store the result pu1_dst
    vqrshrun.s16 d10,q5,#6                  @(2)right shift and saturating narrow result 2

    vld1.u32    {d4},[r12],r11              @(2_1)vector load pu1_src
    vmull.u8    q11,d19,d25                 @(4)mul_res = vmull_u8(src[0_2], coeffabs_1)@

    vld1.u32    {d5},[r12],r11              @(2_1)vector load pu1_src
    vmlsl.u8    q11,d18,d24                 @(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d6},[r12],r11              @(2_1)vector load pu1_src
    vmlal.u8    q11,d20,d26                 @(4)mul_res = vmlal_u8(src[0_4], coeffabs_2)@

    @vld1.u32  {d7},[r12],r2               @(2_1)vector load pu1_src
    vld1.u32    {d7},[r12],r8               @(2_1)vector load pu1_src
    vmlsl.u8    q11,d21,d27                 @(4)mul_res = vmlsl_u8(src[0_6], coeffabs_3)@

    @sub       r12, r12, #6                @(2_1)

    vst1.8      {d10},[r4],r3               @(2)store the result pu1_dst
    vqrshrun.s16 d12,q6,#6                  @(3)right shift and saturating narrow result 1

    vld1.u32    {d14},[r12],r11             @(3_1)vector load pu1_src
    vmull.u8    q4,d1,d25                   @(1_1)mul_res = vmull_u8(src[0_2], coeffabs_1)@

    vld1.u32    {d15},[r12],r11             @(3_1)vector load pu1_src
    vmlsl.u8    q4,d0,d24                   @(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d16},[r12],r11             @(3_1)vector load pu1_src
    vmlal.u8    q4,d2,d26                   @(1_1)mul_res = vmlal_u8(src[0_4], coeffabs_2)@

    @vld1.u32  {d17},[r12],r2              @(3_1)vector load pu1_src
    vld1.u32    {d17},[r12],r8              @(3_1)vector load pu1_src
    vmlsl.u8    q4,d3,d27                   @(1_1)mul_res = vmlsl_u8(src[0_6], coeffabs_3)@

    @sub       r12, r12, #6                @(3_1)

    vst1.8      {d12},[r4],r3               @(3)store the result pu1_dst
    vqrshrun.s16 d22,q11,#6                 @(4)right shift and saturating narrow result 2

    add         r9,r9,#8                    @(core loop)

    vmull.u8    q5,d5,d25                   @(2_1)mul_res = vmull_u8(src[0_2], coeffabs_1)@
    vld1.u32    {d18},[r12],r11             @(4_1)vector load pu1_src

    vld1.u32    {d19},[r12],r11             @(4_1)vector load pu1_src
    vmlsl.u8    q5,d4,d24                   @(2_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d20},[r12],r11             @(4_1)vector load pu1_src
    vmlal.u8    q5,d6,d26                   @(2_1)mul_res = vmlal_u8(src[0_4], coeffabs_2)@

    vld1.u32    {d21},[r12],r2              @(4_1)vector load pu1_src
    vmlsl.u8    q5,d7,d27                   @(2_1)mul_res = vmlsl_u8(src[0_6], coeffabs_3)@

    add         r1,r1,#8                    @(core loop)

    subs        r7,r7,#8                    @(core loop)

    vst1.8      {d22}, [r4], r3             @(4)store the result pu1_dst
    vqrshrun.s16 d8,q4,#6                   @(1_1)right shift and saturating narrow result 1

    mov         r4, r1                      @(core loop)

    bgt         core_loop                   @loopback

epilogue:
    vmull.u8    q6,d15,d25                  @(3)mul_res = vmull_u8(src[0_2], coeffabs_1)@

    vmlsl.u8    q6,d14,d24                  @(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vmlal.u8    q6,d16,d26                  @(3)mul_res = vmlal_u8(src[0_4], coeffabs_2)@

    vmlsl.u8    q6,d17,d27                  @(3)mul_res = vmlsl_u8(src[0_6], coeffabs_3)@

    vst1.8      {d8},[r4],r3                @(1)store the result pu1_dst
    vqrshrun.s16 d10,q5,#6                  @(2)right shift and saturating narrow result 2

    vmull.u8    q11,d19,d25                 @(4)mul_res = vmull_u8(src[0_2], coeffabs_1)@
    vmlsl.u8    q11,d18,d24                 @(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vmlal.u8    q11,d20,d26                 @(4)mul_res = vmlal_u8(src[0_4], coeffabs_2)@

    vmlsl.u8    q11,d21,d27                 @(4)mul_res = vmlsl_u8(src[0_6], coeffabs_3)@

    vst1.8      {d10},[r4],r3               @(2)store the result pu1_dst
    vqrshrun.s16 d12,q6,#6                  @(3)right shift and saturating narrow result 1

    vst1.8      {d12},[r4],r3               @(3)store the result pu1_dst

    add         r1,r1,#8                    @(epilogue)

    vqrshrun.s16 d22,q11,#6                 @(4)right shift and saturating narrow result 2


    vst1.8      {d22}, [r4], r3             @(4)store the result pu1_dst

    sub         r9,r9,r5
    subs        r14,r14,#4                  @decrement the ht loop
    sub         r1,r1,r5
    add         r9,r9,r2,lsl #2
    add         r1,r1,r3,lsl #2
    bgt         outer_loop_ht_4
    b           end_loops

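@ outer_loop_4 handles the remaining widths (wd not a multiple of 4): vzip.32
@ interleaves a row pair into single registers so one set of macs filters
@ both rows, and the two 32-bit halves of the narrowed result go to the two
@ destination rows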
outer_loop_4:
    add         r6,r1,r3                    @pu1_dst + dst_strd
    mov         r7,r5
    add         r4,r12,r2                   @pu1_src + src_strd

inner_loop_4:
    @vld1.u32  {d0,d1},[r12]                   @vector load pu1_src

    vld1.u32    {d0},[r12],r11              @vector load pu1_src
    vld1.u32    {d1},[r12],r11              @vector load pu1_src
    vld1.u32    {d2},[r12],r11              @vector load pu1_src
    vld1.u32    {d3},[r12]                  @vector load pu1_src

    sub         r12,r12,#2                  @rewind the input pointer (net advance of 4)
    vld1.u32    {d4},[r4],r11               @vector load pu1_src
    vld1.u32    {d5},[r4],r11               @vector load pu1_src
    vld1.u32    {d6},[r4],r11               @vector load pu1_src
    vld1.u32    {d7},[r4]                   @vector load pu1_src
    @vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
    @vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
    @vld1.u32  {d12,d13},[r4]                  @vector load pu1_src + src_strd
    @vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]

    sub         r4,r4,#2                    @rewind the input pointer (net advance of 4)
    @vext.u8   d14,d12,d13,#2                  @vector extract of src[0_2]
    @vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
    @vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]

    vzip.32     d0,d4                       @vector zip the i iteration and ii iteration into a single register
    vzip.32     d1,d5
    vzip.32     d2,d6
    vzip.32     d3,d7

    vmull.u8    q4,d1,d25                   @arithmetic operations for both iterations at the same time
    vmlsl.u8    q4,d0,d24
    vmlal.u8    q4,d2,d26
    vmlsl.u8    q4,d3,d27

    vqrshrun.s16 d8,q4,#6                   @right shift, saturate and narrow the result
    vst1.32     {d8[0]},[r1]!               @store the i iteration result which is in the lower part of the register
    subs        r7,r7,#4                    @decrement the wd by 4

    vst1.32     {d8[1]},[r6]!               @store the ii iteration result which is in the upper part of the register

    bgt         inner_loop_4

    sub         r12,r12,r5
    subs        r14,r14,#2                  @decrement the ht by 2
    sub         r1,r1,r5
    add         r12,r12,r2,lsl #1
    add         r1,r1,r3,lsl #1
    bgt         outer_loop_4

end_loops:

    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp and return