1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
20//* @file
21//*  ihevc_intra_pred_chroma_dc_neon.s
22//*
23//* @brief
24//*  contains function definitions for intra prediction dc filtering.
//*  functions are coded using neon assembly and can be compiled using
//*  rvct
28//*
29//* @author
30//*  yogeswaran rs
31//*
32//* @par list of functions:
33//*
34//*
35//* @remarks
36//*  none
37//*
38//*******************************************************************************
39//*/
40///**
41//*******************************************************************************
42//*
43//* @brief
//*    chroma intraprediction filter for dc input
45//*
46//* @par description:
47//*
48//* @param[in] pu1_ref
49//*  uword8 pointer to the source
50//*
51//* @param[out] pu1_dst
52//*  uword8 pointer to the destination
53//*
54//* @param[in] src_strd
55//*  integer source stride
56//*
57//* @param[in] dst_strd
58//*  integer destination stride
59//*
60//* @param[in] pi1_coeff
61//*  word8 pointer to the planar coefficients
62//*
63//* @param[in] nt
//*  size of transform block
65//*
66//* @param[in] mode
67//*  type of filtering
68//*
69//* @returns
70//*
71//* @remarks
72//*  none
73//*
74//*******************************************************************************
75//*/
76
77//void ihevc_intra_pred_chroma_dc(uword8 *pu1_ref,
78//                                word32 src_strd,
79//                                uword8 *pu1_dst,
80//                                word32 dst_strd,
81//                                word32 nt,
82//                                word32 mode)
83//
84//**************variables vs registers*****************************************
85//x0 => *pu1_ref
86//x1 => src_strd
87//x2 => *pu1_dst
88//x3 => dst_strd
89
//x4 => nt
//x5 => mode (overwritten before being read)
94
95.text
96.align 4
97.include "ihevc_neon_macros.s"
98
99
100
101.globl ihevc_intra_pred_chroma_dc_av8
102
103.type ihevc_intra_pred_chroma_dc_av8, %function
104
//-----------------------------------------------------------------------------
// ihevc_intra_pred_chroma_dc_av8
//
// Computes one dc value from the even-indexed bytes and one from the
// odd-indexed bytes of the interleaved reference array (presumably the cb and
// cr components - confirm against the c reference) and fills an nt x nt block
// of interleaved byte pairs at pu1_dst with them.
//
// Each dc value is (sum + nt) >> (32 - clz(nt)), where sum covers nt samples
// read from byte offset 2*nt and nt samples read from byte offset 4*nt + 2
// of pu1_ref.
//
// In:    x0 = pu1_ref, x2 = pu1_dst, w3 = dst_strd, w4 = nt
//        x1 (src_strd) and x5 (mode) are overwritten without being read.
// Out:   none (results are stored through pu1_dst)
// Clobb: x1-x12, v0, v2, v3, v16-v18, v26-v31, flags
// Note:  the summation and copy paths are unrolled for nt == 4, 8 and 16 only.
//-----------------------------------------------------------------------------
ihevc_intra_pred_chroma_dc_av8:

    push_v_regs                             //macro from ihevc_neon_macros.s (presumably saves callee-saved neon regs - confirm)
    stp         x19, x20,[sp,#-16]!         //x20 is used as scratch below

    // dst_strd and nt are 32-bit arguments; aapcs64 leaves bits [63:32] of
    // their registers unspecified, so widen them before any 64-bit use.
    sxtw        x3, w3                      //dst_strd
    sxtw        x4, w4                      //nt

    mov         x9, #0
    mov         v17.s[0], w9                //v17 (low 64 bits) = 0, even-byte sum
    mov         v17.s[1], w9

    clz         w5,w4                       //counts leading zeros of nt

    add         x6, x0, x4,lsl #1           //pu1_ref + 2nt (bytes)
    mov         v18.s[0], w9                //v18 (low 64 bits) = 0, odd-byte sum
    mov         v18.s[1], w9
    sub         x20, x5, #32                //clz(nt) - 32
    neg         x5, x20                     //32 - clz(nt) = log2(nt) + 1 for power-of-two nt
    add         x7, x0, x4, lsl #2          //pu1_ref + 4nt (bytes)
    mov         x12,x5                      //x12 = final right-shift amount
    add         x8, x7, #2                  //pu1_ref + 4nt + 2 (bytes)

    cmp         x4, #4
    beq         dc_4                        //nt=4 has its own path


add_loop:
    ld2         {v30.8b, v31.8b}, [x6], #16 //16 bytes from pu1_ref + 2nt, split even/odd
    lsl         x10,x4,#1                   //2nt

    uaddlp      v2.4h,  v30.8b              //pairwise widen-add of the even bytes
    subs        x10, x10,#0x10              //z set when 2nt == 16 (nt == 8); read by beq below

    ld2         {v26.8b, v27.8b}, [x8],#16  //16 bytes from pu1_ref + 4nt + 2, split even/odd

    uaddlp      v3.4h,  v31.8b              //pairwise widen-add of the odd bytes
    uaddlp      v2.2s,  v2.4h
    uaddlp      v3.2s,  v3.4h

    uadalp      v17.1d,  v2.2s              //accumulate even-byte sum

    uadalp      v18.1d,  v3.2s              //accumulate odd-byte sum

    uaddlp      v2.4h,  v26.8b
    uaddlp      v3.4h,  v27.8b

    uaddlp      v2.2s,  v2.4h
    uaddlp      v3.2s,  v3.4h

    uadalp      v17.1d,  v2.2s
    uadalp      v18.1d,  v3.2s

    beq         epil_add_loop               //nt == 8: one 16-byte load per side is enough

core_loop_add:
    // executed once (there is no backward branch): adds the second 16 bytes
    // of each side, which completes the sums when nt == 16
    ld2         {v30.8b, v31.8b}, [x6],#16  //next 16 bytes of the first run
    uaddlp      v28.4h,  v30.8b
    uaddlp      v3.4h,  v31.8b

    ld2         {v26.8b, v27.8b}, [x8],#16  //next 16 bytes of the second run

    uaddlp      v3.2s,  v3.4h
    uaddlp      v29.2s,  v28.4h

    uadalp      v18.1d,  v3.2s
    uadalp      v17.1d,  v29.2s

    uaddlp      v3.4h,  v27.8b
    uaddlp      v28.4h,  v26.8b

    uaddlp      v3.2s,  v3.4h
    uaddlp      v29.2s,  v28.4h

    uadalp      v18.1d,  v3.2s
    uadalp      v17.1d,  v29.2s


epil_add_loop:

    smov        x1, v18.s[0]                //odd-byte sum
    smov        x11, v17.s[0]               //even-byte sum

    add         x1,x1,x4                    //+ nt (rounding term)
    add         x11,x11,x4

    lsr         x1,x1,x12                   //>> (32 - clz(nt))
    lsr         x11,x11,x12

    dup         v17.8b,w1                   //odd-byte dc replicated
    dup         v16.8b,w11                  //even-byte dc replicated

prologue_cpy_32:
    // x2, x5, x8, x10 point at four consecutive destination rows; every st2
    // writes 16 interleaved bytes (even dc, odd dc, ...) to one row.

    add         x5, x2, x3                  //row 1
    subs        x9, x4, #8                  //z set when nt == 8; read by csel/beq/csel below
    lsl         x6, x3, #2                  //4 * dst_strd
    csel        x11, x6, x11,eq             //nt == 8: epilogue step = 4 rows
    add         x8, x5, x3                  //row 2
    add         x10, x8, x3                 //row 3

    beq         epilogue_copy               //nt == 8: 8 rows of 16 bytes, epilogue only

    st2         {v16.8b, v17.8b}, [x2],#16  //rows 0-3, first 16 bytes
    sub         x6, x6, #16                 //step from mid-row to the start of the row 4 below

    st2         {v16.8b, v17.8b}, [x5],#16
    st2         {v16.8b, v17.8b}, [x8],#16
    mov         x20,#16
    csel        x11, x20, x11,ne            //nt != 8: epilogue step = 16 bytes (second half of row)
    st2         {v16.8b, v17.8b}, [x10],#16


    st2         {v16.8b, v17.8b}, [x2], x6  //rows 0-3, second 16 bytes; advance 4 rows
    st2         {v16.8b, v17.8b}, [x5], x6
    st2         {v16.8b, v17.8b}, [x8], x6
    st2         {v16.8b, v17.8b}, [x10], x6

kernel_copy:
    st2         {v16.8b, v17.8b}, [x2],#16  //rows 4-7, first 16 bytes
    st2         {v16.8b, v17.8b}, [x5],#16
    st2         {v16.8b, v17.8b}, [x8],#16
    st2         {v16.8b, v17.8b}, [x10],#16

    st2         {v16.8b, v17.8b}, [x2], x6  //rows 4-7, second 16 bytes; advance 4 rows
    st2         {v16.8b, v17.8b}, [x5], x6
    st2         {v16.8b, v17.8b}, [x8], x6
    st2         {v16.8b, v17.8b}, [x10], x6

    st2         {v16.8b, v17.8b}, [x2],#16  //rows 8-11, first 16 bytes
    st2         {v16.8b, v17.8b}, [x5],#16
    st2         {v16.8b, v17.8b}, [x8],#16
    st2         {v16.8b, v17.8b}, [x10],#16

    st2         {v16.8b, v17.8b}, [x2], x6  //rows 8-11, second 16 bytes; advance 4 rows
    st2         {v16.8b, v17.8b}, [x5], x6
    st2         {v16.8b, v17.8b}, [x8], x6
    st2         {v16.8b, v17.8b}, [x10], x6

epilogue_copy:
    // nt == 8 : x11 = 4*dst_strd, so this writes rows 0-3 and then rows 4-7
    // nt != 8 : x11 = 16, so this writes both 16-byte halves of the last 4 rows
    st2         {v16.8b, v17.8b}, [x2],x11
    st2         {v16.8b, v17.8b}, [x5],x11
    st2         {v16.8b, v17.8b}, [x8],x11
    st2         {v16.8b, v17.8b}, [x10],x11

    st2         {v16.8b, v17.8b}, [x2]
    st2         {v16.8b, v17.8b}, [x5]
    st2         {v16.8b, v17.8b}, [x8]
    st2         {v16.8b, v17.8b}, [x10]
    b           end_func

dc_4:
    // nt == 4: each ld2 still reads 16 bytes, but only the first four bytes of
    // each de-interleaved register belong to the sum; shl #32 moves them to
    // the top half of the d register and zeroes the rest before adding.
    ld2         {v30.8b, v31.8b},[x6]       //16 bytes from pu1_ref + 2nt, split even/odd
    shl         d3, d30,#32                 //keep even bytes 0-3

    ld2         {v26.8b, v27.8b},[x8]       //16 bytes from pu1_ref + 4nt + 2, split even/odd
    shl         d2, d31,#32                 //keep odd bytes 0-3

    uaddlp      v3.4h,  v3.8b
    uaddlp      v2.4h,  v2.8b
    uaddlp      v3.2s,  v3.4h
    uaddlp      v2.2s,  v2.4h
    uadalp      v17.1d,  v3.2s              //even-byte sum
    uadalp      v18.1d,  v2.2s              //odd-byte sum

    shl         d3, d26,#32                 //keep even bytes 0-3 of the second run
    shl         d2, d27,#32                 //keep odd bytes 0-3 of the second run
    uaddlp      v3.4h,  v3.8b
    uaddlp      v2.4h,  v2.8b
    uaddlp      v3.2s,  v3.4h
    uaddlp      v2.2s,  v2.4h
    uadalp      v17.1d,  v3.2s
    uadalp      v18.1d,  v2.2s

    smov        x10, v17.s[0]               //even-byte sum
    smov        x11, v18.s[0]               //odd-byte sum

    add         x10,x10,x4                  //+ nt (rounding term)
    add         x11,x11,x4
    lsr         x10,x10,x12                 //>> (32 - clz(nt))
    lsr         x11,x11,x12
    orr         x10,x10,x11,lsl #8          //halfword = even dc | (odd dc << 8)
    dup         v0.4h,w10                   //four interleaved pairs = one 8-byte row

    st1         {v0.8b},[x2],x3             //rows 0-3, 8 bytes each
    st1         {v0.8b},[x2],x3
    st1         {v0.8b},[x2],x3
    st1         {v0.8b},[x2]

end_func:
    ldp         x19, x20,[sp],#16           //restore the pair saved at entry
    pop_v_regs                              //macro from ihevc_neon_macros.s, counterpart of push_v_regs
    ret
297
298
299
300
301