1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@*******************************************************************************
20@* @file
21@*  ihevc_intra_pred_chroma_horz_neon.s
22@*
23@* @brief
24@*  contains function definition for intra prediction  interpolation filters
25@*
26@*
27@* @author
28@*  parthiban v
29@*
30@* @par list of functions:
31@*  - ihevc_intra_pred_chroma_horz()
32@*
33@* @remarks
34@*  none
35@*
36@*******************************************************************************
37@*/
38@
@/**
@*******************************************************************************
@*
@* @brief
@*     intra prediction filter for the horizontal mode, chroma samples.
@*
@* @par description:
@*      horizontal intraprediction (mode 10) with reference samples location
@*      pointed by 'pu1_ref' to the tu block location pointed by 'pu1_dst'.
@*      samples are handled as 2-byte elements (presumably one interleaved
@*      chroma pair each - confirm against the caller). row 'r' of the
@*      destination is filled with the element found at byte offset
@*      (4 * nt - 2 - 2 * r) of 'pu1_ref', replicated over 2 * nt bytes.
@*      refer to section 8.4.4.2.6 in the standard (special case)
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the reference samples
@*
@* @param[in] src_strd
@*  integer source stride (not read by this implementation)
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] dst_strd
@*  integer destination stride, added to the destination pointer as a byte
@*  count after every stored row
@*
@* @param[in] nt
@*  integer transform block size. 4 and 8 have dedicated paths; every other
@*  value takes the 16-row path, which writes 16 rows of 32 bytes per pass
@*  and repeats while (nt - 16) stays positive
@*
@* @param[in] mode
@*  integer intraprediction mode (not read by this implementation)
@*
@* @returns
@*  none
@*
@* @remarks
@*  only q0-q1 and q8-q15 are written, so the callee-saved neon registers
@*  d8-d15 (q4-q7) are left untouched as the aapcs requires.
@*
@*******************************************************************************
@*/
@void ihevc_intra_pred_chroma_horz(uword8 *pu1_ref,
@                                  word32 src_strd,
@                                  uword8 *pu1_dst,
@                                  word32 dst_strd,
@                                  word32 nt,
@                                  word32 mode)
@**************variables vs registers*****************************************
@r0  => *pu1_ref
@r1  =>  src_strd (unused)
@r2  => *pu1_dst                 (advanced by dst_strd after every row)
@r3  =>  dst_strd
@r4  =>  nt                      (loaded from the stack)
@r9  => *pu1_dst + 16            (second half of each row, 16-row path only)
@r12 =>  read pointer into pu1_ref, starts at pu1_ref + 4 * nt

.text
.align 4

@ r4-r12 and r14 are pushed on entry: 10 registers * 4 bytes. the fifth
@ argument (nt) is the first stack argument and therefore sits right above them.
.equ    NT_STACK_OFFSET, 40

.globl ihevc_intra_pred_chroma_horz_a9q

.type ihevc_intra_pred_chroma_horz_a9q, %function

ihevc_intra_pred_chroma_horz_a9q:

    stmfd       sp!, {r4-r12, r14}          @save work registers and the return address

    ldr         r4,[sp,#NT_STACK_OFFSET]    @loads nt
    add         r12,r0,r4,lsl #2            @r12 = pu1_ref + 4 * nt

    cmp         r4,#4                       @if nt == 4
    beq         core_loop_4

    cmp         r4,#8                       @if nt == 8
    beq         core_loop_8

    @every other nt falls through to the 16-row path
    sub         r12,r12,#16                 @r12 = pu1_ref + 4 * nt - 16, elements for rows 0-7
    add         r9,r2,#16                   @r9 writes bytes 16-31 of each row

core_loop_16:
    vld1.16     {q0},[r12]                  @elements for rows 0-7, row 0 is d1[3]
    sub         r12,r12,#16
    vld1.16     {q1},[r12]                  @elements for rows 8-15, row 8 is d3[3]
    sub         r12,r12,#16                 @step to the elements of a further pass

    @q8-q11 rotate as row buffers: each vdup prepares a row while the
    @stores of an earlier row are issued.
    vdup.16     q8,d1[3]                    @row 0
    vdup.16     q9,d1[2]                    @row 1
    vdup.16     q10,d1[1]                   @row 2
    vst1.16     {q8},[r2],r3                @row 0, bytes 0-15
    vst1.16     {q8},[r9],r3                @row 0, bytes 16-31

    vdup.16     q11,d1[0]                   @row 3
    vst1.16     {q9},[r2],r3                @row 1
    vst1.16     {q9},[r9],r3

    vdup.16     q8,d0[3]                    @row 4
    vst1.16     {q10},[r2],r3               @row 2
    vst1.16     {q10},[r9],r3

    vdup.16     q9,d0[2]                    @row 5
    vst1.16     {q11},[r2],r3               @row 3
    vst1.16     {q11},[r9],r3

    vdup.16     q10,d0[1]                   @row 6
    vst1.16     {q8},[r2],r3                @row 4
    vst1.16     {q8},[r9],r3

    vdup.16     q11,d0[0]                   @row 7
    vst1.16     {q9},[r2],r3                @row 5
    vst1.16     {q9},[r9],r3

    vdup.16     q8,d3[3]                    @row 8
    vst1.16     {q10},[r2],r3               @row 6
    vst1.16     {q10},[r9],r3

    vdup.16     q9,d3[2]                    @row 9
    vst1.16     {q11},[r2],r3               @row 7
    vst1.16     {q11},[r9],r3

    vdup.16     q10,d3[1]                   @row 10
    vst1.16     {q8},[r2],r3                @row 8
    vst1.16     {q8},[r9],r3

    vdup.16     q11,d3[0]                   @row 11
    vst1.16     {q9},[r2],r3                @row 9
    vst1.16     {q9},[r9],r3

    vdup.16     q8,d2[3]                    @row 12
    vst1.16     {q10},[r2],r3               @row 10
    vst1.16     {q10},[r9],r3

    vdup.16     q9,d2[2]                    @row 13
    vst1.16     {q11},[r2],r3               @row 11
    vst1.16     {q11},[r9],r3

    vdup.16     q10,d2[1]                   @row 14
    vst1.16     {q8},[r2],r3                @row 12
    vst1.16     {q8},[r9],r3

    vdup.16     q11,d2[0]                   @row 15
    vst1.16     {q9},[r2],r3                @row 13
    vst1.16     {q9},[r9],r3

    vst1.16     {q10},[r2],r3               @row 14
    vst1.16     {q10},[r9],r3

    vst1.16     {q11},[r2],r3               @row 15
    vst1.16     {q11},[r9],r3

    subs        r4,r4,#16                   @16 rows done
    bgt         core_loop_16

    ldmfd       sp!,{r4-r12,r15}            @restore registers and return (pops pc)

core_loop_8:
    sub         r12,r12,#16                 @r12 = pu1_ref + 4 * nt - 16
    vld1.8      {q0},[r12]                  @elements for rows 0-7, row 0 is d1[3]

    vdup.16     q8,d1[3]                    @row 0
    vdup.16     q9,d1[2]                    @row 1
    vdup.16     q10,d1[1]                   @row 2
    vdup.16     q11,d1[0]                   @row 3
    vst1.16     {q8},[r2],r3

    vdup.16     q12,d0[3]                   @row 4
    vst1.16     {q9},[r2],r3

    vdup.16     q13,d0[2]                   @row 5
    vst1.16     {q10},[r2],r3

    vdup.16     q14,d0[1]                   @row 6
    vst1.16     {q11},[r2],r3

    vdup.16     q15,d0[0]                   @row 7
    vst1.16     {q12},[r2],r3
    vst1.16     {q13},[r2],r3
    vst1.16     {q14},[r2],r3
    vst1.16     {q15},[r2],r3

    ldmfd       sp!,{r4-r12,r15}            @restore registers and return (pops pc)

core_loop_4:
    sub         r12,r12,#8                  @r12 = pu1_ref + 4 * nt - 8
    vld1.8      {d0},[r12]                  @elements for rows 0-3, row 0 is d0[3]

    vdup.16     d16,d0[3]                   @row 0
    vdup.16     d17,d0[2]                   @row 1
    vdup.16     d18,d0[1]                   @row 2
    vdup.16     d19,d0[0]                   @row 3

    vst1.8      {d16},[r2],r3
    vst1.8      {d17},[r2],r3
    vst1.8      {d18},[r2],r3
    vst1.8      {d19},[r2],r3

    ldmfd       sp!,{r4-r12,r15}            @restore registers and return (pops pc)
345
346
347