1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@*******************************************************************************
20@* @file
21@*  ihevc_intra_pred_luma_mode2_neon.s
22@*
23@* @brief
@*  contains the function definition for luma intra prediction mode 2.
@*  the function is coded in arm neon assembly and can be built using
@*  rvct
28@*
29@* @author
30@*  yogeswaran rs
31@*
32@* @par list of functions:
@*  - ihevc_intra_pred_luma_mode2_a9q()
34@*
35@* @remarks
36@*  none
37@*
38@*******************************************************************************
39@*/
40@/**
41@*******************************************************************************
42@*
43@* @brief
@*    luma intra prediction for mode 2
45@*
46@* @par description:
47@*
48@* @param[in] pu1_ref
49@*  uword8 pointer to the source
50@*
51@* @param[out] pu1_dst
52@*  uword8 pointer to the destination
53@*
54@* @param[in] src_strd
55@*  integer source stride
56@*
57@* @param[in] dst_strd
58@*  integer destination stride
59@*
60@* @param[in] pi1_coeff
61@*  word8 pointer to the planar coefficients
62@*
63@* @param[in] nt
@*  size of transform block
65@*
66@* @param[in] mode
67@*  type of filtering
68@*
69@* @returns
70@*
71@* @remarks
72@*  none
73@*
74@*******************************************************************************
75@*/
76
77@void ihevc_intra_pred_luma_mode2(uword8 *pu1_ref,
78@                                 word32 src_strd,
79@                                 uword8 *pu1_dst,
80@                                 word32 dst_strd,
81@                                 word32 nt,
82@                                 word32 mode)
83@
84@**************variables vs registers*****************************************
85@r0 => *pu1_ref
86@r1 => src_strd
87@r2 => *pu1_dst
88@r3 => dst_strd
89
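@stack contents from #40 (offset measured after the register push at function entry)
@   nt
@   mode
@   pi1_coeff (listed here, but not part of the prototype above and never loaded)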
94
.text
.align 4

.globl ihevc_intra_pred_luma_mode2_a9q

.type ihevc_intra_pred_luma_mode2_a9q, %function

@------------------------------------------------------------------------------
@ ihevc_intra_pred_luma_mode2_a9q
@
@ writes an nt x nt block of bytes:
@     pu1_dst[row * dst_strd + col] = pu1_ref[2 * nt - 2 - row - col]
@ i.e. every output row is a run of consecutive reference bytes in reversed
@ order, and each row starts one reference byte lower than the row above it.
@
@ in:    r0        = pu1_ref
@        r1        = src_strd  (not read; r1 is reused as the tile counter)
@        r2        = pu1_dst
@        r3        = dst_strd
@        [sp, #0]  = nt        ([sp, #40] after the register push below)
@        [sp, #4]  = mode      (never loaded)
@ out:   none
@ paths: nt == 4 branches to mode2_4; every other nt is processed in 8x8
@        tiles, which assumes nt is a multiple of 8.
@ neon:  only d0-d7 and d16-d23 are written. d8-d15 have to be preserved by
@        a callee under the AAPCS, so they are deliberately left untouched
@        and no vpush/vpop is required.
@------------------------------------------------------------------------------

ihevc_intra_pred_luma_mode2_a9q:

    stmfd       sp!, {r4-r12, r14}          @save 10 core registers (40 bytes)

    ldr         r4,[sp,#40]                 @r4 = nt (first stack argument)
    mov         r8,#-2                      @post-index step: each pointer feeds every second row

    cmp         r4,#4
    beq         mode2_4

    add         r0,r0,r4,lsl #1             @r0 = pu1_ref + 2 * nt

    sub         r0,r0,#9                    @r0 = &pu1_ref[2 * nt - 9], the 8 bytes feeding row 0
    sub         r10,r0,#1                   @r10 = source of row 1 (even rows use r0, odd rows r10)

@ load and reverse the first 8x8 tile while setting up pointers and counters
prologue_cpy_32:

    vld1.8      {d0},[r0],r8                @row 0
    mov         r11,r4                      @r11 = columns left in the band being stored

    vld1.8      {d1},[r10],r8               @row 1
    mov         r6, r2                      @r6 = dst row 0 of the tile

    vld1.8      {d2},[r0],r8                @row 2
    vld1.8      {d3},[r10],r8               @row 3
    lsr         r1, r4, #3                  @r1 = nt / 8

    vld1.8      {d4},[r0],r8                @row 4
    vld1.8      {d5},[r10],r8               @row 5
    vld1.8      {d6},[r0],r8                @row 6
    mul         r1, r4, r1                  @r1 = nt * (nt / 8) = 8 * number of 8x8 tiles

    vld1.8      {d7},[r10],r8               @row 7
    add         r7,r6,r3                    @r7 = dst row 1

    vrev64.8    d16,d0                      @reverse the byte order of each row
    vrev64.8    d17,d1
    lsl         r5, r3, #2                  @r5 = 4 * dst_strd (row 0 -> row 4, row 1 -> row 5, ...)

    vrev64.8    d18,d2
    vrev64.8    d19,d3
    add         r9,r7,r3                    @r9 = dst row 2

    vrev64.8    d20,d4
    subs        r1,r1,#8                    @account for the tile just loaded; zero when nt == 8

    vrev64.8    d21,d5
    vrev64.8    d22,d6
    vrev64.8    d23,d7
    add         r14,r9,r3                   @r14 = dst row 3

    beq         epilogue_mode2              @single tile: just store it

    sub         r12,r4,#8                   @r12 = columns left in the band being loaded

@ each pass stores the tile held in d16-d23 and loads/reverses the next one
kernel_mode2:

    vst1.8      {d16},[r6],r5               @rows 0-3, pointers advance to rows 4-7
    vst1.8      {d17},[r7],r5
    subs        r11,r11,#8                  @store side: one more tile of this band done

    vst1.8      {d18},[r9],r5
    addgt       r2,r2,#8                    @band not finished: next tile is 8 columns to the right

    vst1.8      {d19},[r14],r5
    vst1.8      {d20},[r6],r5               @rows 4-7, r6 ends 8 rows below the tile start
    movle       r11,r4                      @band finished: restart the column count

    vst1.8      {d21},[r7],r5
    vst1.8      {d22},[r9],r5

    vst1.8      {d23},[r14],r5
    vld1.8      {d0},[r0],r8
    sub         r14,r4,#8                   @r14 is free after its last store: r14 = nt - 8

    vld1.8      {d1},[r10],r8
    vld1.8      {d2},[r0],r8

    vld1.8      {d3},[r10],r8
    vld1.8      {d4},[r0],r8
    suble       r2, r6, r14                 @band finished (flags still from r11): first tile of the band below

    vld1.8      {d5},[r10],r8
    subs        r12,r12,#8                  @load side: one more tile of this band fetched

    vld1.8      {d6},[r0],r8
    mov         r6, r2                      @dst row 0 of the tile just loaded

    vld1.8      {d7},[r10],r8
    addle       r0, r0, r4                  @band exhausted on the load side:

    vrev64.8    d16,d0
    add         r7, r6, r3

    vrev64.8    d17,d1
    suble       r0, r0, #8                  @  ... rewind the source by nt - 8 in total

    vrev64.8    d18,d2
    movle       r12,r4                      @  ... and restart the load-side column count

    vrev64.8    d19,d3
    add         r9, r7, r3

    vrev64.8    d20,d4
    sub         r10,r0,#1                   @odd-row source follows r0

    vrev64.8    d21,d5
    subs        r1, r1, #8                  @any tiles left to load?

    vrev64.8    d22,d6
    add         r14, r9, r3

    vrev64.8    d23,d7

    bne         kernel_mode2

@ store the last tile that was loaded
epilogue_mode2:

    vst1.8      {d16},[r6],r5
    vst1.8      {d17},[r7],r5
    vst1.8      {d18},[r9],r5
    vst1.8      {d19},[r14],r5
    vst1.8      {d20},[r6],r5
    vst1.8      {d21},[r7],r5
    vst1.8      {d22},[r9],r5
    vst1.8      {d23},[r14],r5

    b           end_func

@ nt == 4: four 8-byte loads, of which only the upper four bytes of each
@ (the low lane after reversal) are stored. r8 is still -2 from function entry.
@ NOTE(review): these loads start at pu1_ref - 1, - 2, - 3 and - 4, so up to
@ four bytes below pu1_ref are read (and discarded) - confirm that the caller's
@ buffer makes those addresses readable.
mode2_4:

    sub         r0,r0,#1                    @row 0 source: pu1_ref[-1 .. 6]
    sub         r10,r0,#1                   @row 1 source: pu1_ref[-2 .. 5]

    vld1.8      {d0},[r0],r8
    add         r5,r2,r3                    @r5 = dst row 1
    vld1.8      {d2},[r10],r8
    add         r6,r5,r3                    @r6 = dst row 2
    vld1.8      {d4},[r0]                   @row 2 source: pu1_ref[-3 .. 4]
    add         r7,r6,r3                    @r7 = dst row 3
    vld1.8      {d6},[r10]                  @row 3 source: pu1_ref[-4 .. 3]

    vrev64.8    d1,d0
    vrev64.8    d3,d2

    vst1.32     {d1[0]},[r2]                @row 0 = pu1_ref[6], [5], [4], [3]
    vrev64.8    d5,d4
    vst1.32     {d3[0]},[r5]
    vrev64.8    d7,d6
    vst1.32     {d5[0]},[r6]
    vst1.32     {d7[0]},[r7]

end_func:
    ldmfd       sp!,{r4-r12,r15}            @restore registers and return (saved lr is popped into pc)
264
265
266
267
268
269
270
271