1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@*******************************************************************************
20@* @file
21@*  ihevc_intra_pred_luma_mode2_neon.s
22@*
23@* @brief
24@*  contains the function definition for chroma intra prediction mode 2.
25@* the function is coded in neon assembly and can be compiled using
26
27@* rvct
28@*
29@* @author
30@*  yogeswaran rs
31@*
32@* @par list of functions:
33@*  ihevc_intra_pred_chroma_mode2_a9q()
34@*
35@* @remarks
36@*  none
37@*
38@*******************************************************************************
39@*/
40@/**
41@*******************************************************************************
42@*
43@* @brief
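44@*    chroma intra prediction for mode 2
45@*
46@* @par description:
47@*    every destination row is a run of 2-byte pairs copied from pu1_ref in reverse order; each following row starts one pair lower in pu1_ref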
48@* @param[in] pu1_ref
49@*  uword8 pointer to the source
50@*
51@* @param[out] pu1_dst
52@*  uword8 pointer to the destination
53@*
54@* @param[in] src_strd
55@*  integer source stride
56@*
57@* @param[in] dst_strd
58@*  integer destination stride
59@*
60@* @param[in] pi1_coeff
61@*  word8 pointer to the planar coefficients
62@*
63@* @param[in] nt
64@*  size of transform block
65@*
66@* @param[in] mode
67@*  type of filtering
68@*
69@* @returns
70@*
71@* @remarks
72@*  none
73@*
74@*******************************************************************************
75@*/
76
77@void ihevc_intra_pred_chroma_mode2_a9q(uword8 *pu1_ref,
78@                                       word32 src_strd,
79@                                       uword8 *pu1_dst,
80@                                       word32 dst_strd,
81@                                       word32 nt,
82@                                       word32 mode)
83@
84@**************variables vs registers*****************************************
85@r0 => *pu1_ref
86@r1 => src_strd
87@r2 => *pu1_dst
88@r3 => dst_strd
89
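90@stack contents from #104 (40 bytes of stmfd + 64 bytes of vpush pushed by the function prologue)
91@   nt
92@   mode (never loaded by this routine)
93@   pi1_coeff (listed here but absent from the prototype above; never loaded by this routine)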
94
95.equ    nt_offset,      104                 @sp offset of nt after the prologue: 40 bytes (stmfd) + 64 bytes (vpush)
96
97.text
98.align 4
99@ihevc_intra_pred_chroma_mode2_a9q: writes nt rows of 2*nt bytes to pu1_dst; each row is a byte-pair-reversed copy of a window of pu1_ref
100@   in : r0 = pu1_ref, r2 = pu1_dst, r3 = dst_strd, first stack word = nt
101@   r1 (src_strd) is overwritten as a loop counter; mode is never loaded
102@   r4-r12, lr and d8-d15 are saved on entry and restored on exit
103.globl ihevc_intra_pred_chroma_mode2_a9q
104
105.type ihevc_intra_pred_chroma_mode2_a9q, %function
106
107ihevc_intra_pred_chroma_mode2_a9q:
108
109    stmfd       sp!, {r4-r12, r14}          @save r4-r12 and lr (10 words = 40 bytes)
110    vpush       {d8 - d15}                  @save d8-d15 (64 bytes); they are used as load targets below
111
112    ldr         r4,[sp,#nt_offset]          @loads nt (first stack argument)
113    mov         r8,#-4                      @r8 = post-index step of every source load: back 4 bytes (2 pairs)
114
115    cmp         r4,#4
116    beq         mode2_4                     @nt == 4 takes the dedicated 4-row path
117
118    add         r0,r0,r4,lsl #2             @r0 = pu1_ref + 4*nt
119
120    sub         r0,r0,#0x12                 @r0 = pu1_ref + 4*nt - 18: source of row 0 (16 bytes ending at pu1_ref + 4*nt - 3)
121    add         r10,r0,#-2                  @r10 = r0 - 2: source of row 1 (one 2-byte pair lower)
122@prologue: load and reverse the first block (8 rows x 16 bytes); r0 feeds even rows, r10 feeds odd rows
123prologue_cpy_32:
124
125    vld2.8      {d0,d1},[r0],r8             @row 0: d0 = even bytes, d1 = odd bytes; r0 -= 4
126
127    mov         r11,r4                      @r11 = nt: counter that drives the pu1_dst column/band update
128    vrev64.8    d16,d0                      @reverse the 8 bytes of each register:
129    vrev64.8    d17,d1                      @d16,d17 = row 0 ready to be stored
130
131    vld2.8      {d2,d3},[r10],r8            @row 1; r10 -= 4
132    mov         r6, r2                      @r6 = store pointer for rows 0 and 4
133
134    vld2.8      {d4,d5},[r0],r8             @row 2
135    vld2.8      {d6,d7},[r10],r8            @row 3
136    lsr         r1, r4, #3                  @r1 = nt / 8 (the src_strd argument is discarded)
137
138    vld2.8      {d8,d9},[r0],r8             @row 4
139    vld2.8      {d10,d11},[r10],r8          @row 5
140    vld2.8      {d12,d13},[r0],r8           @row 6
141    mul         r1, r4, r1                  @r1 = nt * (nt / 8) = 8 * number of 8-row x 16-byte blocks
142
143    vld2.8      {d14,d15},[r10],r8          @row 7
144    add         r7,r6,r3                    @r7 = store pointer for rows 1 and 5
145
146    vrev64.8    d18,d2                      @d18,d19 = row 1
147    vrev64.8    d19,d3
148    lsl         r5, r3, #2                  @r5 = 4 * dst_strd: post-index step of every store
149
150    vrev64.8    d20,d4                      @d20,d21 = row 2
151    vrev64.8    d21,d5
152    add         r9,r7,r3                    @r9 = store pointer for rows 2 and 6
153
154    vrev64.8    d22,d6                      @d22,d23 = row 3
155    vrev64.8    d23,d7
156
157    vrev64.8    d24,d8                      @d24,d25 = row 4
158    vrev64.8    d25,d9
159
160    vrev64.8    d26,d10                     @d26,d27 = row 5
161    subs        r1,r1,#8                    @one block is already loaded; Z is set when it is the only block
162
163    vrev64.8    d27,d11
164
165    vrev64.8    d28,d12                     @d28,d29 = row 6
166    vrev64.8    d29,d13
167
168    vrev64.8    d30,d14                     @d30,d31 = row 7
169    add         r14,r9,r3                   @r14 = store pointer for rows 3 and 7
170    vrev64.8    d31,d15
171
172    beq         epilogue_mode2              @single block (flags from the subs on r1): skip the loop
173
174    sub         r12,r4,#8                   @r12 = counter that drives the source rewind; nt - 8 because one block is already loaded
175@kernel: each pass stores the block loaded previously and loads/reverses the next one
176kernel_mode2:
177
178    vst2.8      {d16,d17},[r6],r5           @row 0 (vst2 re-interleaves the two registers); r6 += 4*dst_strd
179    vst2.8      {d18,d19},[r7],r5           @row 1
180    subs        r11,r11,#8                  @8 more pairs of this band stored; flags feed the gt/le ops below (neon ops leave them untouched)
181    vst2.8      {d20,d21},[r9],r5           @row 2
182    vst2.8      {d22,d23},[r14],r5          @row 3
183    vst2.8      {d24,d25},[r6],r5           @row 4; r6 is now 8 rows below the block start
184    addgt       r2,r2,#16                   @band not finished: next block is 16 bytes to the right
185    vst2.8      {d26,d27},[r7],r5           @row 5
186    vst2.8      {d28,d29},[r9],r5           @row 6
187    vst2.8      {d30,d31},[r14],r5          @row 7
188
189    vld2.8      {d0,d1},[r0],r8             @next block, row 0
190    movle       r11,r4                      @band finished: reload the band counter
191
192    vld2.8      {d2,d3},[r10],r8            @row 1
193    vld2.8      {d4,d5},[r0],r8             @row 2
194    addle       r2, r2, r3, lsl #2          @NOTE(review): result is overwritten by the suble on r2 below before any use
195    vld2.8      {d6,d7},[r10],r8            @row 3
196    vrev64.8    d16,d0
197
198    vld2.8      {d8,d9},[r0],r8             @row 4
199    vld2.8      {d10,d11},[r10],r8          @row 5
200    suble       r2, r6,#16                  @band finished: r2 = 8 rows below the block just stored, 16 bytes to the left
201    vld2.8      {d12,d13},[r0],r8           @row 6
202    vrev64.8    d17,d1
203    vld2.8      {d14,d15},[r10],r8          @row 7
204@NOTE(review): the suble on r2 above lands on column 0 only when a band is two blocks wide (nt == 16) - confirm callers never pass a larger nt
205    subs        r12,r12,#8                  @one more block loaded for the current band; flags feed the le ops below
206    mov         r6, r2                      @rebuild the store pointers for the block just loaded
207    addle       r0, r0, r4,lsl #1           @band fully loaded: undo the 2*nt bytes walked back across it
208    add         r7, r6, r3
209
210    vrev64.8    d18,d2
211    suble       r0, r0, #16                 @and start the next band 16 bytes (8 pairs) lower
212    vrev64.8    d19,d3
213
214    vrev64.8    d20,d4
215    movle       r12,r4                      @band fully loaded: reload the source-rewind counter
216    vrev64.8    d21,d5
217
218    vrev64.8    d22,d6
219    add         r9, r7, r3
220    vrev64.8    d23,d7
221
222    vrev64.8    d24,d8
223    add         r10,r0,#-2                  @odd-row source pointer follows r0, one pair lower
224    vrev64.8    d25,d9
225
226    vrev64.8    d26,d10
227    subs        r1, r1, #8                  @one more block done; Z is set when the block just loaded is the last
228    vrev64.8    d27,d11
229
230    vrev64.8    d28,d12
231    vrev64.8    d29,d13
232
233    vrev64.8    d30,d14
234    add         r14, r9, r3
235    vrev64.8    d31,d15
236
237    bne         kernel_mode2                @flags from the subs on r1
238@epilogue: store the last block that was loaded and reversed
239epilogue_mode2:
240
241    vst2.8      {d16,d17},[r6],r5           @row 0
242    vst2.8      {d18,d19},[r7],r5           @row 1
243    vst2.8      {d20,d21},[r9],r5           @row 2
244    vst2.8      {d22,d23},[r14],r5          @row 3
245    vst2.8      {d24,d25},[r6],r5           @row 4
246    vst2.8      {d26,d27},[r7],r5           @row 5
247    vst2.8      {d28,d29},[r9],r5           @row 6
248    vst2.8      {d30,d31},[r14],r5          @row 7
249
250    b           end_func
251@nt == 4: four rows of 8 bytes; each load still reads 16 bytes but only the first 4 pairs are kept
252mode2_4:
253
254    lsl         r12,r4,#1                   @r12 = 2*nt
255    add         r0,r0,r12
256    sub         r0,r0,#2                    @r0 = pu1_ref + 2*nt - 2: source of row 0
257
258    vld2.8      {d12,d13},[r0],r8           @row 0: d12 = even bytes, d13 = odd bytes; r0 -= 4 (now source of row 2)
259    vshl.i64    d0,d12,#32                  @keep the first 4 bytes: move them to the upper 32 bits, zero the rest
260    add         r10,r0,#2                   @r10 = source of row 1 (one pair below row 0; r0 was already stepped back by 4)
261    vshl.i64    d1,d13,#32
262
263    vrev64.8    d0,d0                       @byte-reverse: the 4 kept bytes end up reversed in the lower 32 bits
264    vld2.8      {d14,d15},[r10],r8          @row 1; r10 -= 4 (now source of row 3)
265    vshl.i64    d2,d14,#32
266
267    vrev64.8    d1,d1
268    vshl.i64    d3,d15,#32
269    vzip.8      d0,d1                       @re-interleave: d0 = the 4 pairs of row 0 in reverse order
270    vst1.8      {d0},[r2],r3                @store row 0 (8 bytes); r2 += dst_strd
271
272    vrev64.8    d2,d2
273    vld2.8      {d16,d17},[r0],r8           @row 2
274    vshl.i64    d4,d16,#32
275    vrev64.8    d3,d3
276    vshl.i64    d5,d17,#32
277    vzip.8      d2,d3                       @d2 = row 1
278    vrev64.8    d4,d4
279    vrev64.8    d5,d5
280    vst1.8      {d2},[r2],r3                @store row 1
281
282
283    vld2.8      {d18,d19},[r10],r8          @row 3
284    vshl.i64    d6,d18,#32
285
286    vzip.8      d4,d5                       @d4 = row 2
287    vshl.i64    d7,d19,#32
288    vrev64.8    d6,d6
289    vst1.8      {d4},[r2],r3                @store row 2
290
291    vrev64.8    d7,d7
292    vzip.8      d6,d7                       @d6 = row 3
293    vst1.8      {d6},[r2],r3                @store row 3
294
295end_func:
296    vpop        {d8 - d15}                  @restore d8-d15
297    ldmfd       sp!,{r4-r12,r15}            @restore r4-r12 and return by loading the saved lr into pc
298
299
300
301
302
303
304