@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_chroma_mode_3_to_9.s
@*
@* @brief
@*  contains the function definition for chroma intra prediction for
@*  angular modes 3 to 9. functions are coded using neon intrinsics and
@*  can be compiled using rvct
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    chroma intra prediction interpolation filter for angular modes 3 to 9
@*
@* @par description:
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  intra prediction mode (3 to 9)
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@void ihevc_intra_pred_chroma_mode_3_to_9(uword8 *pu1_ref,
@                                       word32 src_strd,
@                                       uword8 *pu1_dst,
@                                       word32 dst_strd,
@                                       word32 nt,
@                                       word32 mode)
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #40
@   nt
@   mode

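@**************reference behaviour (informative, editor-added)****************
@The following is a hedged C-style sketch, not part of the original source,
@of the per-sample operation the neon kernel below performs. It is inferred
@from the idx/fract split, the (32 - fract)/fract weighting and the >>5
@rounding visible in the code; names (pos, idx, fract, ref) are illustrative
@assumptions, and sign/direction handling for these negative-angle modes is
@omitted.
@
@    pos   = (col + 1) * intra_pred_ang;        /* per-column projection      */
@    idx   = pos >> 5;                          /* integer part of the offset */
@    fract = pos & 31;                          /* 5-bit fractional part      */
@    /* two-tap interpolation on interleaved u/v reference samples (c = 0,1) */
@    pred  = ((32 - fract) * ref[2 * idx + c]
@              + fract     * ref[2 * (idx + 1) + c] + 16) >> 5;
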
.text
.align 4




.globl ihevc_intra_pred_chroma_mode_3_to_9_a9q
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern col_for_intra_chroma
.extern idx_neg_idx_chroma_3_9

gai4_ihevc_ang_table_addr:
.long gai4_ihevc_ang_table - ulbl1 - 8

gai4_ihevc_inv_ang_table_addr:
.long gai4_ihevc_inv_ang_table - ulbl2 - 8


idx_neg_idx_chroma_3_9_addr:
.long idx_neg_idx_chroma_3_9 - ulbl3 - 8

col_for_intra_chroma_addr_1:
.long col_for_intra_chroma - ulbl4 - 8

col_for_intra_chroma_addr_2:
.long col_for_intra_chroma - ulbl5 - 8

col_for_intra_chroma_addr_3:
.long col_for_intra_chroma - ulbl6 - 8

.type ihevc_intra_pred_chroma_mode_3_to_9_a9q, %function

ihevc_intra_pred_chroma_mode_3_to_9_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r4,[sp,#40]                 @loads nt
    ldr         r7, gai4_ihevc_ang_table_addr
ulbl1:
    add         r7,r7,pc

    ldr         r5,[sp,#44]                 @mode (3 to 9)
    ldr         r8, gai4_ihevc_inv_ang_table_addr
ulbl2:
    add         r8,r8,pc

    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
    ldr         r7, [r7]                    @intra_pred_ang
    vdup.8      d30, r7                     @intra_pred_ang

    ldr         r14, col_for_intra_chroma_addr_1
ulbl4:
    add         r14,r14,pc

prologue_8_16_32:
    lsr         r10, r4, #3
    vld1.8      d31, [r14]!
    mul         r10, r4, r10                @block counter (dec by #8)

    mov         r11, r4, lsl #1             @col counter to be inc/dec by #8
    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)

    sub         r7, r5, #3
    ldr         r12, idx_neg_idx_chroma_3_9_addr @load most idx table
ulbl3:
    add         r12,r12,pc

    add         r12, r12, r7, lsl #4
    mov         r8, r12

    mov         r7, #8
    sub         r7, r7, r3, lsl #3          @r7 = 8-8r3

    ldr         r9, [r8]
    mov         r9, r9, lsl #1
    add         r1, r0, r4, lsl #2          @pu1_ref + 4*nt

    vmovn.s16   d6, q11
    vdup.8      d26, r9                     @most idx added to final idx values
    sub         r1, r1, #26                 @ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row

    sub         r6, r1, r9

    vld1.8      {d0,d1,d2,d3}, [r6]         @stores the 32 values reqd based on indices values (from most idx)
    vshr.s16    q11, q11, #5

    vmov.i8     d29, #31                    @contains #31 for vand operation

    vmov.i8     d28, #32

    vqmovn.s16  d8, q11
    vshl.s8     d8, d8, #1                  @ 2 * idx

    vand        d6, d6, d29                 @fract values in d1/ idx values in d0
    vmov.i8     d29, #2                     @contains #2 for adding to get ref_main_idx + 1

    movw        r0,#0x302                   @ idx value for v is +1 of u
    vdup.u16    d27,r0
    mov         r0,#0

    vmov.i8     d9, #22                     @row 0 to 7

    vsub.s8     d8, d8, d27                 @ref_main_idx (sub row)
    vsub.s8     d8, d26, d8                 @ref_main_idx (row 0)
    vadd.s8     d8, d8, d9                  @to compensate the pu1_src idx incremented by 8
    vsub.s8     d9, d8, d29                 @ref_main_idx + 1 (row 0)
    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 0)
    vsub.s8     d7, d28, d6                 @32-fract

    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 0)
    vsub.s8     d4, d8, d29                 @ref_main_idx (row 1)
    vsub.s8     d5, d9, d29                 @ref_main_idx + 1 (row 1)

    vmov.i8     d29, #4

    vtbl.8      d16, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 1)
    vmull.u8    q12, d12, d7                @mul (row 0)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 1)
    vsub.s8     d8, d8, d29                 @ref_main_idx (row 2)
    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 2)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)

    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 2)
    vmull.u8    q11, d16, d7                @mul (row 1)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 2)
    vsub.s8     d4, d4, d29                 @ref_main_idx (row 3)
    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 3)

    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 3)
    vmull.u8    q10, d14, d7                @mul (row 2)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 3)
    vsub.s8     d8, d8, d29                 @ref_main_idx (row 4)
    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 4)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 4)
    vmull.u8    q9, d10, d7                 @mul (row 3)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 4)
    vsub.s8     d4, d4, d29                 @ref_main_idx (row 5)
    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 5)

    vst1.8      d20, [r2], r3               @st (row 2)
    vrshrn.i16  d18, q9, #5                 @round shft (row 3)

    vtbl.8      d16, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 5)
    vmull.u8    q12, d12, d7                @mul (row 4)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 5)
    vsub.s8     d8, d8, d29                 @ref_main_idx (row 6)
    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 6)

    vst1.8      d18, [r2], r3               @st (row 3)
    cmp         r4,#4
    beq         end_func
    vrshrn.i16  d24, q12, #5                @round shft (row 4)

    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 6)
    vmull.u8    q11, d16, d7                @mul (row 5)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 6)
    vsub.s8     d4, d4, d29                 @ref_main_idx (row 7)
    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 7)

    vst1.8      d24, [r2], r3               @st (row 4)
    vrshrn.i16  d22, q11, #5                @round shft (row 5)

    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)
    vmull.u8    q10, d14, d7                @mul (row 6)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 7)
    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d22, [r2], r3               @st (row 5)
    vrshrn.i16  d20, q10, #5                @round shft (row 6)
    vrshrn.i16  d18, q9, #5                 @round shft (row 7)

    vst1.8      d20, [r2], r3               @st (row 6)

    subs        r10, r10, #4                @subtract 8 and go to end if 8x8

    vst1.8      d18, [r2], r3               @st (row 7)

    beq         end_func

    subs        r11, r11, #8                @decrement the processed col
    addgt       r8, r8, #4
    addgt       r2, r2, r7
    movle       r8, r12
    suble       r2, r2, r4
    addle       r2, r2, #8
    movle       r11, r4, lsl #1
    ldrle       r14, col_for_intra_chroma_addr_2
ulbl5:
    addle       r14,r14,pc
    addle       r0, r0, #8

    vld1.8      d31, [r14]!
    vmull.s8    q6, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    vmovn.s16   d10, q6
    vshr.s16    q6, q6, #5
    vqmovn.s16  d11, q6
    vshl.s8     d11, d11, #1
    movw        r5, #0x302                  @idx value for v is +1 of u
    vdup.u16    d27, r5                     @row value inc or reset accordingly
    ldr         r9, [r8]                    @loads index value
    mov         r9, r9, lsl #1
    mov         r5, #22
    sub         r5, r5, r0, lsl #1
    vdup.8      d16, r5
    vdup.8      d26, r9

    mov         r5,r2
    vsub.s8     d11, d11, d27               @ref_main_idx (sub row)

kernel_8_16_32:
    vmov.i8     d29, #2                     @contains #2 for adding to get ref_main_idx + 1
    vsub.s8     d8, d26, d11                @ref_main_idx
    vmov        d26,d10

    subs        r11, r11, #8
    sub         r6, r1, r9
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)
    vadd.s8     d8, d8, d16                 @to compensate the pu1_src idx incremented by 8

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx - 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    addle       r0, r0, #8
    vsub.s8     d9, d8, d29                 @ref_main_idx - 2
    addgt       r8, r8, #4

    vld1.8      {d0,d1,d2,d3}, [r6]         @stores the 32 values reqd based on indices values (from most idx)
    vrshrn.i16  d22, q11, #5                @round shft (row 5)

    ldrle       r14, col_for_intra_chroma_addr_3
ulbl6:
    addle       r14,r14,pc
    vst1.8      d24, [r5], r3               @st (row 4)
    movle       r8, r12

    movw        r9,#0x302
    vdup.16     d27, r9                     @row value inc or reset accordingly
    vsub.s8     d4, d8, d29                 @ref_main_idx (row 1)

    vsub.s8     d5, d9, d29                 @ref_main_idx - 1 (row 1)
    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 0)
    vmov.i8     d29, #31                    @contains #31 for vand operation

    vmull.u8    q9, d10, d7                 @mul (row 7)
    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 0)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vld1.8      d31, [r14]!
    vand        d6, d29, d26                @fract values in d1/ idx values in d0

    movle       r11, r4, lsl #1
    vmov.i8     d29, #4                     @contains #4 for subtracting to get next row's ref_main_idx
    ldr         r9, [r8]

    vst1.8      d22, [r5], r3               @(from previous loop)st (row 5)
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vsub.s8     d8, d8, d29                 @ref_main_idx (row 2)
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 1)
    vsub.s8     d9, d9, d29                 @ref_main_idx - 1 (row 2)

    mov         r9,r9,lsl #1
    vsub.s8     d7, d28, d6                 @32-fract

    vmull.u8    q12, d12, d7                @mul (row 0)
    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 1)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vsub.s8     d4, d4, d29                 @ref_main_idx (row 3)
    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 2)
    vsub.s8     d5, d5, d29                 @ref_main_idx - 1 (row 3)

    vmull.u8    q11, d10, d7                @mul (row 1)
    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 2)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)
    vst1.8      d18, [r5], r3               @(from previous loop)st (row 7)

    vsub.s8     d8, d8, d29                 @ref_main_idx (row 4)
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 3)
    vsub.s8     d9, d9, d29                 @ref_main_idx - 1 (row 4)

    vmull.u8    q10, d14, d7                @mul (row 2)
    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 3)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    add         r5,r2,r3,lsl#2
    vmull.s8    q7, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    add         r9, r9, r0, lsl #1

    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vsub.s8     d4, d4, d29                 @ref_main_idx (row 5)
    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 4)
    vsub.s8     d5, d5, d29                 @ref_main_idx - 1 (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 3)
    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 4)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vmovn.s16   d10, q7
    vshr.s16    q7, q7, #5

    vsub.s8     d8, d8, d29                 @ref_main_idx (row 6)
    vtbl.8      d21, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 5)
    vsub.s8     d9, d9, d29                 @ref_main_idx - 1 (row 6)

    vmull.u8    q12, d12, d7                @mul (row 4)
    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 5)
    vqmovn.s16  d11, q7

    vst1.8      d20, [r2], r3               @st (row 2)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vrshrn.i16  d18, q9, #5                 @round shft (row 3)
    vdup.8      d26, r9

    vsub.s8     d4, d4, d29                 @ref_main_idx (row 7)
    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 6)
    vsub.s8     d5, d5, d29                 @ref_main_idx - 1 (row 7)

    mov         r6, #22                     @to compensate the 2*row value
    vshl.u8     d11,#1
    sub         r6, r6, r0, lsl #1

    vmull.u8    q11, d21, d7                @mul (row 5)
    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 6)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vst1.8      d18, [r2], r3               @st (row 3)
    vrshrn.i16  d24, q12, #5                @round shft (row 4)

    add         r2,r2,r3, lsl #2
    vdup.8      d16, r6
    addgt       r2, r7, r2

    suble       r2, r2, r4
    vsub.s8     d11, d11, d27               @ref_main_idx (add row)
    suble       r2,r2,#8

    subs        r10, r10, #4                @subtract 8 and go to end if 8x8

    bne         kernel_8_16_32

epil_8_16_32:
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vst1.8      d24, [r5], r3               @st (row 4)
    vrshrn.i16  d24, q11, #5                @round shft (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d24, [r5], r3               @(from previous loop)st (row 5)
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vst1.8      d18, [r5], r3               @st (row 7)

end_func:
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp