1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@*******************************************************************************
20@* @file
21@*  ihevc_intra_pred_chroma_dc_neon.s
22@*
23@* @brief
24@*  contains function definitions for intra prediction dc filtering.
@*  functions are coded in neon assembly and can be compiled using
@*  rvct
28@*
29@* @author
30@*  yogeswaran rs
31@*
32@* @par list of functions:
@*  - ihevc_intra_pred_chroma_dc_a9q()
34@*
35@* @remarks
36@*  none
37@*
38@*******************************************************************************
39@*/
40@/**
41@*******************************************************************************
42@*
43@* @brief
@*    chroma intra prediction filter for dc input
45@*
46@* @par description:
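@*    averages (with rounding) the even-offset bytes and the odd-offset bytes
@*    of the two reference runs separately - each run is 2*nt bytes long, one
@*    starting at pu1_ref + 2*nt and one at pu1_ref + 4*nt + 2 - and fills
@*    every row of the destination block with the resulting byte pair
@*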
48@* @param[in] pu1_ref
49@*  uword8 pointer to the source
50@*
51@* @param[out] pu1_dst
52@*  uword8 pointer to the destination
53@*
54@* @param[in] src_strd
55@*  integer source stride
56@*
57@* @param[in] dst_strd
58@*  integer destination stride
59@*
60@* @param[in] pi1_coeff
@*  word8 pointer to the planar coefficients
@*  (not part of the prototype below and never read by this routine)
62@*
63@* @param[in] nt
@*  size of transform block
65@*
66@* @param[in] mode
67@*  type of filtering
68@*
69@* @returns
70@*
71@* @remarks
72@*  none
73@*
74@*******************************************************************************
75@*/
76
77@void ihevc_intra_pred_chroma_dc(uword8 *pu1_ref,
78@                                word32 src_strd,
79@                                uword8 *pu1_dst,
80@                                word32 dst_strd,
81@                                word32 nt,
82@                                word32 mode)
83@
84@**************variables vs registers*****************************************
85@r0 => *pu1_ref
86@r1 => src_strd
87@r2 => *pu1_dst
88@r3 => dst_strd
89
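@stack contents from #40 (after the 10-register, 40-byte save in the prologue)
@   nt          => [sp, #40]
@   mode        => [sp, #44] (never loaded)
@   pi1_coeff   (not in the prototype above; never loaded)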
94
.text
.align 4

.globl ihevc_intra_pred_chroma_dc_a9q
.type  ihevc_intra_pred_chroma_dc_a9q, %function

@------------------------------------------------------------------------------
@ ihevc_intra_pred_chroma_dc_a9q
@
@ DC prediction for a byte-interleaved block: one DC value is computed from
@ the even-offset reference bytes and one from the odd-offset reference bytes
@ (presumably the two chroma components - confirm against the caller), and
@ the destination rows are filled with that repeating byte pair.
@
@ In:     r0 = pu1_ref
@         r1 = src_strd  (never read; r1 is reused as scratch)
@         r2 = pu1_dst
@         r3 = dst_strd
@         nt on the stack, at [sp, #40] once r4-r12 and lr have been pushed
@         mode is never loaded
@ Out:    nothing in registers; result is written through pu1_dst
@ Saved:  r4-r12, lr (restored on exit)
@ Clobb:  r1, flags, d0, d2, d3, d16-d18, d26-d31
@
@ Paths, selected on nt:
@   nt == 4 : 4 even + 4 odd bytes of each run are summed, 4 rows x 8 bytes
@             are stored
@   nt == 8 : 16 bytes of each run are summed, 8 rows x 16 bytes are stored
@   else    : 32 bytes of each run are summed, 16 rows x 32 bytes are stored
@             (i.e. the remaining path is only sized for nt == 16)
@
@ DC value = (sum + nt) >> (32 - clz(nt)); for a power-of-two nt the shift
@ equals log2(nt) + 1, i.e. log2 of the 2*nt bytes summed per parity.
@------------------------------------------------------------------------------
ihevc_intra_pred_chroma_dc_a9q:

    stmfd       sp!, {r4-r12, lr}           @ 10 registers = 40 bytes of save area

    ldr         r4, [sp, #40]               @ r4 = nt (first stacked argument)
    mov         r9, #0
    vmov        d17, r9, r9                 @ d17 = 0 : accumulator, even-offset bytes

    clz         r5, r4                      @ r5 = leading zero count of nt

    add         r6, r0, r4, lsl #1          @ r6 = pu1_ref + 2*nt : first reference run
    vmov        d18, r9, r9                 @ d18 = 0 : accumulator, odd-offset bytes
    rsb         r5, r5, #32                 @ r5 = 32 - clz(nt) = rounding shift
    add         r7, r0, r4, lsl #2          @ r7 = pu1_ref + 4*nt
    mov         r12, r5                     @ r12 keeps the shift; r5 is reused below
    add         r8, r7, #2                  @ r8 = pu1_ref + 4*nt + 2 : second reference run

    cmp         r4, #4
    beq         .Lchroma_dc_nt4             @ nt == 4 has its own path


@ ---- first 16 bytes of each run ---------------------------------------------
.Lchroma_dc_sum_first16:
    vld2.8      {d30, d31}, [r6]!           @ run 1: d30 = even bytes, d31 = odd bytes
    mov         r10, r4, lsl #1             @ r10 = 2*nt = bytes per run

    vpaddl.u8   d2, d30                     @ widen + pairwise add, even bytes
    subs        r10, r10, #16               @ Z set when the run is exactly 16 bytes;
                                            @ consumed by the beq below (the NEON
                                            @ instructions in between leave flags alone)

    vld2.8      {d26, d27}, [r8]!           @ run 2: d26 = even bytes, d27 = odd bytes

    vpaddl.u8   d3, d31                     @ widen + pairwise add, odd bytes
    vpaddl.u16  d2, d2
    vpaddl.u16  d3, d3

    vpadal.u32  d17, d2                     @ even accumulator += run 1

    vpadal.u32  d18, d3                     @ odd accumulator  += run 1

    vpaddl.u8   d2, d26
    vpaddl.u8   d3, d27

    vpaddl.u16  d2, d2
    vpaddl.u16  d3, d3

    vpadal.u32  d17, d2                     @ even accumulator += run 2
    vpadal.u32  d18, d3                     @ odd accumulator  += run 2

    beq         .Lchroma_dc_sum_done        @ 2*nt == 16 : nothing left to add

@ ---- second 16 bytes of each run (executed once, no back edge) --------------
.Lchroma_dc_sum_second16:
    vld2.8      {d30, d31}, [r6]!           @ run 1, next 16 bytes
    vpaddl.u8   d28, d30
    vpaddl.u8   d3, d31

    vld2.8      {d26, d27}, [r8]!           @ run 2, next 16 bytes

    vpaddl.u16  d3, d3
    vpaddl.u16  d29, d28

    vpadal.u32  d18, d3                     @ odd accumulator  += run 1
    vpadal.u32  d17, d29                    @ even accumulator += run 1

    vpaddl.u8   d3, d27
    vpaddl.u8   d28, d26

    vpaddl.u16  d3, d3
    vpaddl.u16  d29, d28

    vpadal.u32  d18, d3                     @ odd accumulator  += run 2
    vpadal.u32  d17, d29                    @ even accumulator += run 2


@ ---- sums -> DC bytes ---------------------------------------------------------
.Lchroma_dc_sum_done:

    vmov.32     r1, d18[0]                  @ r1  = sum of odd-offset bytes
    vmov.32     r11, d17[0]                 @ r11 = sum of even-offset bytes

    add         r1, r1, r4                  @ + nt : rounding term
    add         r11, r11, r4

    mov         r1, r1, lsr r12             @ r1  = odd DC
    mov         r11, r11, lsr r12           @ r11 = even DC

    vdup.8      d17, r1                     @ d17 = odd DC in every lane
    vdup.8      d16, r11                    @ d16 = even DC in every lane
                                            @ vst2.8 {d16,d17} writes 16 bytes of
                                            @ even,odd,even,odd,...

@ ---- store: four row pointers, each stepping 4 rows at a time ---------------
.Lchroma_dc_store_setup:

    add         r5, r2, r3                  @ r5  = row 1
    subs        r9, r4, #8                  @ Z set when nt == 8 (only the flags are used)
    mov         r6, r3, lsl #2              @ r6  = 4 * dst_strd
    moveq       r11, r6                     @ nt == 8 : tail step = 4 rows
    add         r8, r5, r3                  @ r8  = row 2
    add         r10, r8, r3                 @ r10 = row 3

    beq         .Lchroma_dc_store_tail      @ nt == 8 : the tail alone writes all 8 rows

    @ rows 0-3, 32 bytes each
    vst2.8      {d16, d17}, [r2]!           @ first 16 bytes, pointer += 16
    sub         r6, r6, #16                 @ r6 = 4 * dst_strd - 16 : back to column 0,
                                            @ four rows further down

    vst2.8      {d16, d17}, [r5]!
    vst2.8      {d16, d17}, [r8]!
    movne       r11, #16                    @ always taken on this path : tail step =
                                            @ 16 bytes, i.e. second half of the same row
    vst2.8      {d16, d17}, [r10]!


    vst2.8      {d16, d17}, [r2], r6        @ second 16 bytes, then on to row + 4
    vst2.8      {d16, d17}, [r5], r6
    vst2.8      {d16, d17}, [r8], r6
    vst2.8      {d16, d17}, [r10], r6

.Lchroma_dc_store_mid:
    @ rows 4-7, 32 bytes each
    vst2.8      {d16, d17}, [r2]!
    vst2.8      {d16, d17}, [r5]!
    vst2.8      {d16, d17}, [r8]!
    vst2.8      {d16, d17}, [r10]!

    vst2.8      {d16, d17}, [r2], r6
    vst2.8      {d16, d17}, [r5], r6
    vst2.8      {d16, d17}, [r8], r6
    vst2.8      {d16, d17}, [r10], r6

    @ rows 8-11, 32 bytes each
    vst2.8      {d16, d17}, [r2]!
    vst2.8      {d16, d17}, [r5]!
    vst2.8      {d16, d17}, [r8]!
    vst2.8      {d16, d17}, [r10]!

    vst2.8      {d16, d17}, [r2], r6
    vst2.8      {d16, d17}, [r5], r6
    vst2.8      {d16, d17}, [r8], r6
    vst2.8      {d16, d17}, [r10], r6

.Lchroma_dc_store_tail:
    @ nt == 8 : rows 0-3, then (step = 4 rows) rows 4-7, 16 bytes each
    @ else    : first, then (step = 16 bytes) second half of the last four rows
    vst2.8      {d16, d17}, [r2], r11
    vst2.8      {d16, d17}, [r5], r11
    vst2.8      {d16, d17}, [r8], r11
    vst2.8      {d16, d17}, [r10], r11

    vst2.8      {d16, d17}, [r2]
    vst2.8      {d16, d17}, [r5]
    vst2.8      {d16, d17}, [r8]
    vst2.8      {d16, d17}, [r10]
    b           .Lchroma_dc_return

@ ---- nt == 4 ------------------------------------------------------------------
.Lchroma_dc_nt4:
    vld2.8      {d30, d31}, [r6]            @ run 1: 16 bytes are loaded, 8 are used
    vshl.i64    d3, d30, #32                @ keep only the first 4 even bytes
                                            @ (moved to the upper half, lower half = 0)

    vld2.8      {d26, d27}, [r8]            @ run 2: 16 bytes are loaded, 8 are used
    vshl.i64    d2, d31, #32                @ keep only the first 4 odd bytes

    vpaddl.u8   d3, d3
    vpaddl.u8   d2, d2
    vpaddl.u16  d3, d3
    vpaddl.u16  d2, d2
    vpadal.u32  d17, d3                     @ even accumulator += run 1
    vpadal.u32  d18, d2                     @ odd accumulator  += run 1

    vshl.i64    d3, d26, #32                @ same again for run 2
    vshl.i64    d2, d27, #32
    vpaddl.u8   d3, d3
    vpaddl.u8   d2, d2
    vpaddl.u16  d3, d3
    vpaddl.u16  d2, d2
    vpadal.u32  d17, d3                     @ even accumulator += run 2
    vpadal.u32  d18, d2                     @ odd accumulator  += run 2

    vmov.32     r10, d17[0]                 @ r10 = sum of even-offset bytes
    vmov.32     r11, d18[0]                 @ r11 = sum of odd-offset bytes

    add         r10, r10, r4                @ + nt : rounding term
    add         r11, r11, r4
    mov         r10, r10, lsr r12           @ r10 = even DC
    mov         r11, r11, lsr r12           @ r11 = odd DC
    orr         r10, r10, r11, lsl #8       @ halfword = even DC | odd DC << 8
    vdup.16     d0, r10                     @ d0 = that halfword in all four lanes

    vst1.8      {d0}, [r2], r3              @ 4 rows of 8 bytes
    vst1.8      {d0}, [r2], r3
    vst1.8      {d0}, [r2], r3
    vst1.8      {d0}, [r2]

.Lchroma_dc_return:
    ldmfd       sp!, {r4-r12, pc}           @ restore r4-r12 and return (saved lr -> pc)
289
290
291
292
293