1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
20//* @file
21//*  ihevc_intra_pred_chroma_horz_neon.s
22//*
23//* @brief
24//*  contains function definition for intra prediction  interpolation filters
25//*
26//*
27//* @author
28//*  parthiban v
29//*
30//* @par list of functions:
//*  - ihevc_intra_pred_chroma_horz_av8()
32//*
33//* @remarks
34//*  none
35//*
36//*******************************************************************************
37//*/
38//
39///**
40//*******************************************************************************
41//*
42//* @brief
//*     intra prediction filter for the horizontal mode, chroma variant.
44//*
45//* @par description:
//*      horizontal intraprediction (mode 10) with reference samples location
//*      pointed by 'pu1_ref' to the tu block location pointed by 'pu1_dst'. refer
//*      to section 8.4.4.2.6 in the standard (special case)
49//*
//* @param[in] pu1_ref
//*  uword8 pointer to the reference samples
52//*
53//* @param[out] pu1_dst
54//*  uword8 pointer to the destination
55//*
56//* @param[in] src_strd
57//*  integer source stride
58//*
59//* @param[in] dst_strd
60//*  integer destination stride
61//*
62//* @param[in] nt
63//*  integer transform block size
64//*
65//* @param[in] mode
66//*  integer intraprediction mode
67//*
68//* @returns
69//*
70//* @remarks
71//*  none
72//*
73//*******************************************************************************
74//*/
75//void ihevc_intra_pred_chroma_horz(uword8 *pu1_ref,
76//                                  word32 src_strd,
77//                                  uword8 *pu1_dst,
78//                                  word32 dst_strd,
79//                                  word32 nt,
80//                                  word32 mode)
81//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 =>  src_strd
//x2 => *pu1_dst
//x3 =>  dst_strd
//x4 =>  nt
//x5 =>  mode
86
87.text
88.align 4
89.include "ihevc_neon_macros.s"
90
91
92.globl ihevc_intra_pred_chroma_horz_av8
93
94.type ihevc_intra_pred_chroma_horz_av8, %function
95
//-----------------------------------------------------------------------------
// void ihevc_intra_pred_chroma_horz_av8(uword8 *pu1_ref, word32 src_strd,
//                                       uword8 *pu1_dst, word32 dst_strd,
//                                       word32 nt,       word32 mode)
//
// Horizontal intra prediction on 16-bit elements: destination row 'r' is
// filled with the single 16-bit element found at pu1_ref + 4*nt - 2 - 2*r,
// replicated across the row.
//
// In:    x0 = pu1_ref
//        x2 = pu1_dst
//        w3 = dst_strd (bytes between consecutive destination rows)
//        w4 = nt
//        x1 (src_strd) and x5 (mode) are not read.
// Out:   nt == 4      : 4 rows of  8 bytes written at pu1_dst
//        nt == 8      : 8 rows of 16 bytes written at pu1_dst
//        otherwise    : 16 rows of 32 bytes per loop pass, one pass per 16 in nt
// Clobb: x2, x3, x4, x9, x12, v0-v6, v18-v20, flags
// Stack: none (leaf routine, only caller-saved registers are touched)
//-----------------------------------------------------------------------------
ihevc_intra_pred_chroma_horz_av8:

    // dst_strd and nt are 32-bit arguments; AAPCS64 leaves bits [63:32] of
    // their registers unspecified, so widen them before any 64-bit use.
    sxtw        x3,w3                       //dst_strd
    sxtw        x4,w4                       //nt

    add         x12,x0,x4,lsl #2            //x12 = pu1_ref + 4*nt

    cmp         x4,#4                       //if nt == 4
    beq         core_loop_4

    cmp         x4,#8                       //if nt == 8
    beq         core_loop_8

    // Every other nt falls through to the 32-byte-wide path.
    // NOTE(review): rows are always 32 bytes wide here, so this presumably
    // expects nt == 16 only - confirm against the callers.
    sub         x12,x12,#16                 //last 8 elements below pu1_ref + 4*nt
    add         x9,x2,#16                   //x9 = second 16-byte half of each row

    // One pass = 16 rows. The dup for a row is issued two rows ahead of its
    // stores, rotating through v2/v4/v6/v1.
core_loop_16:
    ld1         { v0.8h},[x12]              //elements for rows 0-7, h[7] is row 0
    sub         x12,x12,#16
    ld1         { v18.8h},[x12]             //elements for rows 8-15, h[7] is row 8

    dup         v2.8h, v0.h[7]              //row 0 value
    dup         v4.8h, v0.h[6]              //row 1 value
    dup         v6.8h, v0.h[5]              //row 2 value
    st1         { v2.8h},[x2],x3            //row 0, bytes 0-15
    st1         { v2.8h},[x9],x3            //row 0, bytes 16-31

    dup         v1.8h, v0.h[4]              //row 3 value
    st1         { v4.8h},[x2],x3            //row 1
    st1         { v4.8h},[x9],x3

    dup         v2.8h, v0.h[3]              //row 4 value
    st1         { v6.8h},[x2],x3            //row 2
    st1         { v6.8h},[x9],x3

    dup         v4.8h, v0.h[2]              //row 5 value
    st1         { v1.8h},[x2],x3            //row 3
    st1         { v1.8h},[x9],x3

    dup         v6.8h, v0.h[1]              //row 6 value
    st1         { v2.8h},[x2],x3            //row 4
    st1         { v2.8h},[x9],x3

    dup         v1.8h, v0.h[0]              //row 7 value
    st1         { v4.8h},[x2],x3            //row 5
    st1         { v4.8h},[x9],x3

    dup         v2.8h, v18.h[7]             //row 8 value
    st1         { v6.8h},[x2],x3            //row 6
    st1         { v6.8h},[x9],x3

    dup         v4.8h, v18.h[6]             //row 9 value
    st1         { v1.8h},[x2],x3            //row 7
    st1         { v1.8h},[x9],x3

    dup         v6.8h, v18.h[5]             //row 10 value
    st1         { v2.8h},[x2],x3            //row 8
    st1         { v2.8h},[x9],x3

    dup         v1.8h, v18.h[4]             //row 11 value
    st1         { v4.8h},[x2],x3            //row 9
    st1         { v4.8h},[x9],x3

    dup         v2.8h, v18.h[3]             //row 12 value
    st1         { v6.8h},[x2],x3            //row 10
    st1         { v6.8h},[x9],x3

    dup         v4.8h, v18.h[2]             //row 13 value
    st1         { v1.8h},[x2],x3            //row 11
    st1         { v1.8h},[x9],x3

    dup         v6.8h, v18.h[1]             //row 14 value
    st1         { v2.8h},[x2],x3            //row 12
    st1         { v2.8h},[x9],x3
    sub         x12,x12,#16                 //step to the 8 elements for the next pass

    dup         v1.8h, v18.h[0]             //row 15 value
    st1         { v4.8h},[x2],x3            //row 13
    st1         { v4.8h},[x9],x3

    subs        x4,x4,#16                   //16 rows done; flags survive the stores below
    st1         { v6.8h},[x2],x3            //row 14
    st1         { v6.8h},[x9],x3

    st1         { v1.8h},[x2],x3            //row 15
    st1         { v1.8h},[x9],x3
    bgt         core_loop_16

    ret

    // nt == 8: eight rows of 16 bytes.
core_loop_8:
    sub         x12,x12,#16                 //x12 = pu1_ref + 4*nt - 16
    ld1         { v0.16b},[x12]             //8 elements, h[7] is row 0

    dup         v18.8h, v0.h[7]             //row 0 value
    dup         v2.8h, v0.h[6]              //row 1 value
    dup         v4.8h, v0.h[5]              //row 2 value
    dup         v6.8h, v0.h[4]              //row 3 value
    dup         v1.8h, v0.h[3]              //row 4 value

    st1         { v18.8h},[x2],x3           //row 0 (v18 is reused for row 5 next)

    dup         v18.8h, v0.h[2]             //row 5 value
    dup         v19.8h, v0.h[1]             //row 6 value
    dup         v20.8h, v0.h[0]             //row 7 value

    st1         { v2.8h},[x2],x3            //row 1
    st1         { v4.8h},[x2],x3            //row 2
    st1         { v6.8h},[x2],x3            //row 3
    st1         { v1.8h},[x2],x3            //row 4
    st1         { v18.8h},[x2],x3           //row 5
    st1         { v19.8h},[x2],x3           //row 6
    st1         { v20.8h},[x2],x3           //row 7

    ret

    // nt == 4: four rows of 8 bytes.
core_loop_4:
    sub         x12,x12,#8                  //x12 = pu1_ref + 4*nt - 8
    ld1         {v0.8b},[x12]               //4 elements, h[3] is row 0

    dup         v6.4h, v0.h[3]              //row 0 value
    dup         v3.4h, v0.h[2]              //row 1 value
    dup         v4.4h, v0.h[1]              //row 2 value
    dup         v5.4h, v0.h[0]              //row 3 value

    st1         {v6.8b},[x2],x3             //row 0
    st1         {v3.8b},[x2],x3             //row 1
    st1         {v4.8b},[x2],x3             //row 2
    st1         {v5.8b},[x2],x3             //row 3

    ret
359
360
361
362