@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_filters_planar.s
@*
@* @brief
@*  contains the function definition for planar intra prediction.
@*  functions are coded using neon assembly and can be compiled using
@*  rvct
@*
@* @author
@*  akshaya mukund
@*
@* @par list of functions:
@*  - ihevc_intra_pred_luma_planar_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    luma intra prediction filter for planar input
@*
@* @par description:
@*    each output sample is a position-weighted sum of four reference
@*    samples: src[nt-1], src[3nt+1], src[2nt+1+col] and src[2nt-1-row]
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the planar coefficients
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  type of filtering
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@void ihevc_intra_pred_luma_planar(uword8* pu1_ref,
@                                  word32 src_strd,
@                                  uword8* pu1_dst,
@                                  word32 dst_strd,
@                                  word32 nt,
@                                  word32 mode,
@                                  word32 pi1_coeff)
@
@ value written for every (row, col) of the nt x nt block:
@
@   dst[row][col] = ( (row+1)    * src[nt-1]
@                   + (col+1)    * src[3nt+1]
@                   + (nt-1-row) * src[2nt+1+col]
@                   + (nt-1-col) * src[2nt-1-row]
@                   + nt ) >> (32 - clz(nt))
@
@ (32 - clz(nt) equals log2(nt) + 1 when nt is a power of two; the nt == 4
@  path uses a rounding narrowing shift by 3, which is the same operation.)
@
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd      (never read; r1 is reused as a counter)
@r2 => *pu1_dst
@r3 => dst_strd
@
@stack contents from #nt_offset (i.e. above the saved core and neon registers)
@   nt
@   mode             (never read)
@   pi1_coeff        (never read)
@
@ neon registers d8-d15 are callee-saved under the AAPCS and are used here
@ (d8, d9, q5, q6, q7), so they are pushed on entry and popped before return.

@ 10 core registers (40 bytes) + d8-d15 (64 bytes) sit below the stack args
.equ nt_offset, 104

.text
.align 4




.globl ihevc_intra_pred_luma_planar_a9q
.extern gau1_ihevc_planar_factor
.extern gau1_ihevc_planar_factor_1

@ pc-relative offsets to the coefficient tables. in arm state pc reads as the
@ address of the current instruction + 8, hence the "- 8".
gau1_ihevc_planar_factor_addr:
.long gau1_ihevc_planar_factor - ulbl1 - 8

gau1_ihevc_planar_factor_1_addr:
.long gau1_ihevc_planar_factor_1 - ulbl2 - 8


.type ihevc_intra_pred_luma_planar_a9q, %function

ihevc_intra_pred_luma_planar_a9q:

    stmfd       sp!, {r4-r12, r14}          @save callee-saved core registers and lr
    vpush       {d8 - d15}                  @save callee-saved neon registers (AAPCS)

    ldr         r4,[sp,#nt_offset]          @loads nt
    ldr         r11, gau1_ihevc_planar_factor_addr @loads table of coeffs
ulbl1:
    add         r11,r11,pc

    clz         r5, r4
    rsb         r5, r5, #32                 @shift amount = 32 - clz(nt)
    vdup.16     q7, r5
    vneg.s16    q7, q7                      @shr value (so vneg)
    vdup.8      d2, r4                      @nt
    vdup.s16    q8, r4                      @nt (only referenced by the commented-out 4x4 code)

    sub         r6, r4, #1                  @nt-1
    add         r6, r6, r0
    ldrb        r7, [r6]                    @single byte: avoids an unaligned word load
    vdup.s8     d0, r7                      @src[nt-1]

    add         r6, r4, r4,lsl #1           @3nt
    add         r6, r6, #1                  @3nt + 1
    add         r6, r6, r0
    ldrb        r7, [r6]                    @single byte: avoids an unaligned word load
    vdup.s8     d1, r7                      @src[3nt+1]

    add         r6, r4, r4                  @2nt
    add         r14, r6, #1                 @2nt+1
    sub         r6, r6, #1                  @2nt-1
    add         r6, r6, r0                  @&src[2nt-1]
    add         r14, r14, r0                @&src[2nt+1]

    mov         r8, #1                      @row+1 (row is first 0)
    sub         r9, r4, r8                  @nt-1-row (row is first 0)

    vdup.s8     d5, r8                      @row + 1
    vdup.s8     d6, r9                      @nt - 1 - row
    vmov        d7, d5                      @mov #1 to d7 to used for inc for row+1 and dec for nt-1-row

    add         r12, r11, #1                @coeffs (to be reloaded after every row)
    mov         r1, r4                      @nt (row counter) (dec after every row)
    mov         r5, r2                      @dst (to be reloaded after every row and inc by dst_strd)
    mov         r10, #8                     @increment for the coeffs
    mov         r0, r14                     @&src[2nt+1] (to be reloaded after every row)

    cmp         r4, #4
    beq         tf_sz_4

@@ ========== ***************** =====================
@ nt = 8/16/32: the block is produced in 8x8 tiles, left to right and then
@ top to bottom. the code is software pipelined: the tags (1)..(8) name the
@ row of the current tile an instruction belongs to, (1n) marks work done
@ early for row 1 of the next tile.
prolog:
tf_sz_8_16_32:

    mov         r7, r4                      @column counter (set to no of cols)
    mov         r9, r4, lsr #3              @divide nt by 8
    mul         r7, r7, r9                  @nt * (nt/8); reduced by 8 per 8x8 tile
    ldr         r5, gau1_ihevc_planar_factor_1_addr @loads table of coeffs
ulbl2:
    add         r5,r5,pc
    sub         r6, r6, #7                  @&src[2nt-8]: 8-byte load ends at src[2nt-1]
    mov         r8, r2
    lsl         r9, r3, #3                  @8*dst_strd
    rsb         r9, r9, #8                  @8 - 8*dst_strd (step to the tile on the right)
    mov         r10, r4                     @nt
    sub         r10, r10, #8                @nt - 8

col_loop_8_16_32:

    vld1.s8     d8, [r12]                   @(1-8)load 8 coeffs [col+1]
    vdup.16     q6, r4                      @(1)accumulator starts at nt
    vld1.s8     d4, [r6]                    @(1-8)src[2nt-1-row]
    vsub.s8     d9, d2, d8                  @(1-8)[nt-1-col]


    vmlal.u8    q6, d5, d0                  @(1)(row+1) *   src[nt-1]

    vld1.s8     d3, [r14]                   @(1-8)load 8 src[2nt+1+col]
    vmlal.u8    q6, d8, d1                  @(1)(col+1) *   src[3nt+1]

    vdup.s8     d20, d4[7]                  @(1)
    vmlal.u8    q6, d6, d3                  @(1)(nt-1-row)  *   src[2nt+1+col]

    vdup.s8     d21, d4[6]                  @(2)
    vmlal.u8    q6, d9, d20                 @(1)(nt-1-col)  *   src[2nt-1-row]

    vdup.16     q15, r4                     @(2)
    vadd.s8     d5, d5, d7                  @(1)

    vsub.s8     d6, d6, d7                  @(1)

    vdup.s8     d22, d4[5]                  @(3)
    vmlal.u8    q15, d5, d0                 @(2)

    vdup.16     q14, r4                     @(3)
    vmlal.u8    q15, d8, d1                 @(2)

    vmlal.u8    q15, d6, d3                 @(2)
    vmlal.u8    q15, d9, d21                @(2)

    vshl.s16    q6, q6, q7                  @(1)shr

    vadd.s8     d5, d5, d7                  @(2)
    vsub.s8     d6, d6, d7                  @(2)

    vmovn.i16   d12, q6                     @(1)
    vmlal.u8    q14, d5, d0                 @(3)

    vdup.8      d23, d4[4]                  @(4)
    vmlal.u8    q14, d8, d1                 @(3)

    vdup.16     q5, r4                      @(4)
    vmlal.u8    q14, d6, d3                 @(3)

    vst1.s8     d12, [r2], r3               @(1)str 8 values
    vmlal.u8    q14, d9, d22                @(3)

    vshl.s16    q15, q15, q7                @(2)shr

    vadd.s8     d5, d5, d7                  @(3)
    vsub.s8     d6, d6, d7                  @(3)

    vmovn.i16   d30, q15                    @(2)
    vmlal.u8    q5, d5, d0                  @(4)

    vdup.8      d20, d4[3]                  @(5)
    vmlal.u8    q5, d8, d1                  @(4)

    vdup.16     q8, r4                      @(5)
    vmlal.u8    q5, d6, d3                  @(4)

    vst1.s8     d30, [r2], r3               @(2)str 8 values
    vmlal.u8    q5, d9, d23                 @(4)

    vshl.s16    q14, q14, q7                @(3)shr

    vadd.s8     d5, d5, d7                  @(4)
    vsub.s8     d6, d6, d7                  @(4)

    vmovn.i16   d28, q14                    @(3)
    vmlal.u8    q8, d5, d0                  @(5)

    vdup.8      d21, d4[2]                  @(6)
    vmlal.u8    q8, d8, d1                  @(5)

    vdup.16     q9, r4                      @(6)
    vmlal.u8    q8, d6, d3                  @(5)

    vst1.s8     d28, [r2], r3               @(3)str 8 values
    vmlal.u8    q8, d9, d20                 @(5)

    vshl.s16    q5, q5, q7                  @(4)shr
    vadd.s8     d5, d5, d7                  @(5)
    vsub.s8     d6, d6, d7                  @(5)

    vmovn.i16   d10, q5                     @(4)
    vmlal.u8    q9, d5, d0                  @(6)

    vdup.8      d22, d4[1]                  @(7)
    vmlal.u8    q9, d8, d1                  @(6)

    vdup.16     q13, r4                     @(7)
    vmlal.u8    q9, d6, d3                  @(6)

    vst1.s8     d10, [r2], r3               @(4)str 8 values
    vmlal.u8    q9, d9, d21                 @(6)

    vshl.s16    q8, q8, q7                  @(5)shr

    vadd.s8     d5, d5, d7                  @(6)
    vsub.s8     d6, d6, d7                  @(6)

    vmovn.i16   d16, q8                     @(5)
    vmlal.u8    q13, d5, d0                 @(7)

    vdup.8      d23, d4[0]                  @(8)
    vmlal.u8    q13, d8, d1                 @(7)

    vdup.16     q12, r4                     @(8)
    vmlal.u8    q13, d6, d3                 @(7)

    vst1.s8     d16, [r2], r3               @(5)str 8 values
    vmlal.u8    q13, d9, d22                @(7)

    vshl.s16    q9, q9, q7                  @(6)shr

    vadd.s8     d5, d5, d7                  @(7)
    vsub.s8     d6, d6, d7                  @(7)

    vmovn.i16   d18, q9                     @(6)
    vmlal.u8    q12, d5, d0                 @(8)


    vmlal.u8    q12, d8, d1                 @(8)

    vmlal.u8    q12, d6, d3                 @(8)

    vst1.s8     d18, [r2], r3               @(6)str 8 values
    vmlal.u8    q12, d9, d23                @(8)

    vshl.s16    q13, q13, q7                @(7)shr

    subs        r7, r7, #8                  @one 8x8 tile accounted for

    beq         epilog                      @nt == 8: only rows 7 and 8 remain to be stored

    subs        r1, r1, #8                  @columns left in this band of 8 rows
    addgt       r12, r12, #8                @col inc
    addgt       r14, r14, #8                @also for col inc
    movle       r1, r4                      @nt reloaded (refresh the value)
    addle       r12, r11, #1                @r12 reset

    movle       r14, r0                     @r14 reset
    vld1.s8     d8, [r12]                   @(1n)(1-8)load 8 coeffs [col+1]

    suble       r6, r6, #8                  @for next set of rows
    vld1.s8     d3, [r14]                   @(1n)(1-8)load 8 src[2nt+1+col]

    addle       r5, r5, #8
    vdup.16     q6, r4                      @(1n)(1)

    vld1.s8     d5, [r5]                    @(row+1) values for the next tile, from the factor_1 table

    vld1.s8     d4, [r6]                    @(1n)(1-8)src[2nt-1-row]
    vsub.s8     d9, d2, d8                  @(1n)(1-8)[nt-1-col]

    vdup.s8     d20, d4[7]                  @(1n)(1)
    vsub.s8     d6, d2, d5                  @(nt-1-row) values for the next tile

    beq         epilog                      @flags still from "subs r1" above (neon ops leave them alone)

kernel_plnr:

    cmp         r1, #0                      @ (cond loop)
    vshl.s16    q12, q12, q7                @(8)shr

    vmovn.i16   d26, q13                    @(7)
    vmlal.u8    q6, d5, d0                  @(1)(row+1) *   src[nt-1]

    vmovn.i16   d24, q12                    @(8)
    vmlal.u8    q6, d8, d1                  @(1)(col+1) *   src[3nt+1]

    vdup.s8     d21, d4[6]                  @(2)
    vmlal.u8    q6, d6, d3                  @(1)(nt-1-row)  *   src[2nt+1+col]

    vdup.16     q15, r4                     @(2)
    vmlal.u8    q6, d9, d20                 @(1)(nt-1-col)  *   src[2nt-1-row]

    vst1.s8     d26, [r2], r3               @(7)str 8 values
    vadd.s8     d5, d5, d7                  @(1)

    vst1.s8     d24, [r2], r3               @(8)str 8 values
    vsub.s8     d6, d6, d7                  @(1)

    addgt       r2, r2, r9                  @since more cols to fill, dst + 8 - 8*strd (cond loop)
    vmlal.u8    q15, d5, d0                 @(2)

    suble       r2, r2, r10                 @else go to next set of rows, dst - (nt-8) (cond loop)
    vmlal.u8    q15, d8, d1                 @(2)

    vdup.s8     d22, d4[5]                  @(3)
    vmlal.u8    q15, d6, d3                 @(2)

    vdup.16     q14, r4                     @(3)
    vmlal.u8    q15, d9, d21                @(2)

    vshl.s16    q6, q6, q7                  @(1)shr

    vadd.s8     d5, d5, d7                  @(2)
    movle       r1, r4                      @nt reloaded (refresh the value)    (cond loop)

    vsub.s8     d6, d6, d7                  @(2)
    subs        r1, r1, #8                  @row counter (loop); flags drive the cond updates below

    vmovn.i16   d12, q6                     @(1)
    vmlal.u8    q14, d5, d0                 @(3)

    vdup.8      d23, d4[4]                  @(4)
    vmlal.u8    q14, d8, d1                 @(3)

    vdup.16     q5, r4                      @(4)
    vmlal.u8    q14, d6, d3                 @(3)

    vst1.s8     d12, [r2], r3               @(1)str 8 values
    vmlal.u8    q14, d9, d22                @(3)

    vshl.s16    q15, q15, q7                @(2)shr

    vadd.s8     d5, d5, d7                  @(3)

    vsub.s8     d6, d6, d7                  @(3)

    vmovn.i16   d30, q15                    @(2)
    vmlal.u8    q5, d5, d0                  @(4)

    vdup.8      d20, d4[3]                  @(5)
    vmlal.u8    q5, d8, d1                  @(4)

    vdup.16     q8, r4                      @(5)
    vmlal.u8    q5, d6, d3                  @(4)

    vst1.s8     d30, [r2], r3               @(2)str 8 values
    vmlal.u8    q5, d9, d23                 @(4)

    vshl.s16    q14, q14, q7                @(3)shr

    vadd.s8     d5, d5, d7                  @(4)

    vsub.s8     d6, d6, d7                  @(4)

    vmovn.i16   d28, q14                    @(3)
    vmlal.u8    q8, d5, d0                  @(5)

    vdup.8      d21, d4[2]                  @(6)
    vmlal.u8    q8, d8, d1                  @(5)

    vdup.16     q9, r4                      @(6)
    vmlal.u8    q8, d6, d3                  @(5)

    vst1.s8     d28, [r2], r3               @(3)str 8 values
    vmlal.u8    q8, d9, d20                 @(5)

    addle       r12, r11, #1                @r12 reset (cond loop)
    vshl.s16    q5, q5, q7                  @(4)shr

    addgt       r12, r12, #8                @col inc (cond loop)
    vadd.s8     d5, d5, d7                  @(5)

    addgt       r14, r14, #8                @also for col inc (cond loop)
    vsub.s8     d6, d6, d7                  @(5)

    vmovn.i16   d10, q5                     @(4)
    vmlal.u8    q9, d5, d0                  @(6)

    vdup.8      d22, d4[1]                  @(7)
    vmlal.u8    q9, d8, d1                  @(6)

    vdup.16     q13, r4                     @(7)
    vmlal.u8    q9, d6, d3                  @(6)

    vst1.s8     d10, [r2], r3               @(4)str 8 values
    vmlal.u8    q9, d9, d21                 @(6)

    movle       r14, r0                     @r14 reset (cond loop)
    vshl.s16    q8, q8, q7                  @(5)shr

    suble       r6, r6, #8                  @for next set of rows (cond loop)
    vadd.s8     d5, d5, d7                  @(6)

    addle       r5, r5, #8                  @ (cond loop)
    vsub.s8     d6, d6, d7                  @(6)

    vmovn.i16   d16, q8                     @(5)
    vmlal.u8    q13, d5, d0                 @(7)

    vdup.8      d23, d4[0]                  @(8)
    vmlal.u8    q13, d8, d1                 @(7)

    vdup.16     q12, r4                     @(8)
    vmlal.u8    q13, d6, d3                 @(7)

    vst1.s8     d16, [r2], r3               @(5)str 8 values
    vmlal.u8    q13, d9, d22                @(7)

    vld1.s8     d4, [r6]                    @(1n)(1-8)src[2nt-1-row]
    vshl.s16    q9, q9, q7                  @(6)shr

    vadd.s8     d5, d5, d7                  @(7)

    vsub.s8     d6, d6, d7                  @(7)

    vmovn.i16   d18, q9                     @(6)
    vmlal.u8    q12, d5, d0                 @(8)

    vld1.s8     d5, [r5]                    @(row+1 value)
    vmlal.u8    q12, d8, d1                 @(8)

    vdup.s8     d20, d4[7]                  @(1n)(1)
    vmlal.u8    q12, d6, d3                 @(8)

    vst1.s8     d18, [r2], r3               @(6)str 8 values
    vmlal.u8    q12, d9, d23                @(8)

    vld1.s8     d8, [r12]                   @(1n)(1-8)load 8 coeffs [col+1]
    vsub.s8     d6, d2, d5                  @(nt-1-row) value

    subs        r7, r7, #8                  @col counter

    vld1.s8     d3, [r14]                   @(1n)(1-8)load 8 src[2nt+1+col]
    vshl.s16    q13, q13, q7                @(7)shr

    vdup.16     q6, r4                      @(1n)(1)
    vsub.s8     d9, d2, d8                  @(1n)(1-8)[nt-1-col]

    bne         kernel_plnr

@ drain the pipeline: rows 7 and 8 of the last tile are still in q13/q12
epilog:

    vmovn.i16   d26, q13                    @(7)
    vst1.s8     d26, [r2], r3               @(7)str 8 values

    vshl.s16    q12, q12, q7                @(8)shr
    vmovn.i16   d24, q12                    @(8)
    vst1.s8     d24, [r2], r3               @(8)str 8 values

@@ ========== ***************** =====================

    beq         end_loop                    @z is set on every path into epilog, so this always skips the 4x4 code

@ nt = 4: one row (4 bytes) per iteration; r1 counts the remaining rows
tf_sz_4:
    vld1.s8     d10, [r14]                  @load src[2nt+1+col]
    vld1.s8     d8, [r12], r10              @load 8 coeffs [col+1]
loop_sz_4:
    mov         r10, #4                     @reduce inc to #4 for 4x4
    ldrb        r7, [r6], #-1               @src[2nt-1-row] (dec to take into account row); byte load, no unaligned word access
    vdup.s8     d4, r7                      @src[2nt-1-row]

    vsub.s8     d9, d2, d8                  @[nt-1-col]

    vmull.u8    q6, d5, d0                  @(row+1)    *   src[nt-1]
    vmlal.u8    q6, d6, d10                 @(nt-1-row) *   src[2nt+1+col]
    vmlal.u8    q6, d8, d1                  @(col+1)    *   src[3nt+1]
    vmlal.u8    q6, d9, d4                  @(nt-1-col) *   src[2nt-1-row]
@   vadd.i16    q6, q6, q8          @add (nt)
@   vshl.s16    q6, q6, q7          @shr
@   vmovn.i16   d12, q6
    vrshrn.s16  d12,q6,#3                   @rounding shift: (sum + 4) >> 3, replaces the three lines above
    vst1.s32    {d12[0]}, [r2], r3          @only the low 4 bytes form the row

    vadd.s8     d5, d5, d7                  @row++ [(row+1)++]
    vsub.s8     d6, d6, d7                  @[nt-1-row]--
    subs        r1, r1, #1

    bne         loop_sz_4

end_loop:
    vpop        {d8 - d15}                  @restore callee-saved neon registers
    ldmfd       sp!,{r4-r12,r15}            @restore core registers and return (saved lr -> pc)