///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_luma_mode2_neon.s
//*
//* @brief
//*  contains the function definition for luma intra prediction mode 2.
//* the function is written in armv8 (aarch64) neon assembly using gnu
//* assembler syntax
//*
//* @author
//*  yogeswaran rs
//*
//* @par list of functions:
//*  - ihevc_intra_pred_luma_mode2_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*    luma intra prediction for mode 2
//*
//* @par description:
//*    every output sample is a copy of one reference sample:
//*    pu1_dst[row * dst_strd + col] = pu1_ref[2 * nt - 2 - row - col]
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the reference samples
//*
//* @param[in] src_strd
//*  integer source stride (not read by this function)
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] nt
//*  size of transform block
//*
//* @param[in] mode
//*  prediction mode (not read by this function)
//*
//* @returns
//*  none
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_intra_pred_luma_mode2(uword8 *pu1_ref,
//                                 word32 src_strd,
//                                 uword8 *pu1_dst,
//                                 word32 dst_strd,
//                                 word32 nt,
//                                 word32 mode)
//
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd
//x2 => *pu1_dst
//x3 => dst_strd
//x4 => nt
//x5 => mode
//
//all six arguments arrive in registers; nothing is read from the stack

.text
.align 4
.include "ihevc_neon_macros.s"



.globl ihevc_intra_pred_luma_mode2_av8

.type ihevc_intra_pred_luma_mode2_av8, %function

//------------------------------------------------------------------------------
// void ihevc_intra_pred_luma_mode2_av8(uword8 *pu1_ref, word32 src_strd,
//                                      uword8 *pu1_dst, word32 dst_strd,
//                                      word32 nt,       word32 mode)
//
// writes an nt x nt block where
//     pu1_dst[row * dst_strd + col] = pu1_ref[2 * nt - 2 - row - col]
// i.e. every row is a byte-reversed run of the reference array and each row
// starts one reference sample lower than the row above it.
//
// in:    x0 = pu1_ref, x2 = pu1_dst, w3 = dst_strd, w4 = nt
//        w1 (src_strd) and w5 (mode) are never read; x1 and x5 are reused
// out:   nothing
//
// nt == 4 takes the short path at mode2_4. every other size is processed in
// 8x8 tiles, so nt is assumed to be a multiple of 8 there (8/16/32 -
// NOTE(review): confirm against the callers).
//
// working registers on the 8x8 path
//   x0, x10      reference read pointers for even / odd tile rows (x10 = x0-1)
//   x8           -2, post-index step of both read pointers
//   x2           top-left destination address of the tile being loaded
//   x6,x7,x9,x14 store pointers for tile rows 0/4, 1/5, 2/6, 3/7
//   x5           4 * dst_strd, post-index step of the store pointers
//   x1           nt * nt / 8, decremented by 8 per tile (loop counter)
//   x11          columns left in the current band of 8 rows, store side
//   x12          same counter for the load side, which runs one tile ahead
//   x20          scratch (callee-saved, hence the stp/ldp pair)
//   v0-v7        eight loaded reference rows
//   v16-v23      the same rows byte-reversed, ready to be stored
//
// clobbers: x0-x2, x5-x12, x14, v0-v7, v16-v23, flags
//------------------------------------------------------------------------------
ihevc_intra_pred_luma_mode2_av8:

    // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments

    stp         x19, x20,[sp,#-16]!

    // dst_strd and nt are 32-bit arguments: bits 63:32 of x3/x4 are not
    // defined on entry, so extend them before any 64-bit use
    sxtw        x3,w3
    sxtw        x4,w4

    mov         x8,#-2

    cmp         x4,#4
    beq         mode2_4

    add         x0,x0,x4,lsl #1

    sub         x0,x0,#9                    //8 bytes from here end at pu1_ref[2*nt - 2]
    sub         x10,x0,#1                   //row 1 starts one sample lower

prologue_cpy_32:

    // load and reverse the first 8x8 tile; the kernel below stores tile k
    // while it loads tile k + 1
    ld1         {v0.8b},[x0],x8
    mov         x11,x4

    ld1         {v1.8b},[x10],x8
    mov         x6, x2

    ld1         {v2.8b},[x0],x8
    ld1         {v3.8b},[x10],x8
    lsr         x1, x4, #3

    ld1         {v4.8b},[x0],x8
    ld1         {v5.8b},[x10],x8
    ld1         {v6.8b},[x0],x8
    mul         x1, x4, x1                  //x1 = nt * nt / 8

    ld1         {v7.8b},[x10],x8
    add         x7,x6,x3

    rev64       v16.8b,  v0.8b
    rev64       v17.8b,  v1.8b
    lsl         x5, x3, #2                  //x5 = 4 * dst_strd

    rev64       v18.8b,  v2.8b
    rev64       v19.8b,  v3.8b
    add         x9,x7,x3

    rev64       v20.8b,  v4.8b
    subs        x1,x1,#8                    //zero when nt == 8 (a single tile)

    rev64       v21.8b,  v5.8b
    rev64       v22.8b,  v6.8b
    rev64       v23.8b,  v7.8b
    add         x14,x9,x3

    beq         epilogue_mode2

    sub         x12,x4,#8

kernel_mode2:

    st1         {v16.8b},[x6],x5
    st1         {v17.8b},[x7],x5
    subs        x11,x11,#8                  //columns left in this band (store side)

    st1         {v18.8b},[x9],x5
    add         x20,x2,#8
    csel        x2, x20, x2,gt              //more columns: next tile is 8 bytes to the right

    st1         {v19.8b},[x14],x5
    st1         {v20.8b},[x6],x5
    csel        x11, x4, x11,le             //band finished: reload the column counter

    st1         {v21.8b},[x7],x5
    st1         {v22.8b},[x9],x5

    st1         {v23.8b},[x14],x5
    ld1         {v0.8b},[x0],x8
    sub         x14,x4,#8                   //x14 is free until it is rebuilt below

    ld1         {v1.8b},[x10],x8
    ld1         {v2.8b},[x0],x8

    ld1         {v3.8b},[x10],x8
    ld1         {v4.8b},[x0],x8
    sub         x20, x6, x14                //x6 = start of the stored tile + 8 rows
    csel        x2, x20, x2,le              //band finished: column 0 of the next band

    ld1         {v5.8b},[x10],x8
    subs        x12,x12,#8                  //columns left in this band (load side)

    ld1         {v6.8b},[x0],x8
    mov         x6, x2

    ld1         {v7.8b},[x10],x8
    add         x20, x0, x4
    csel        x0, x20, x0,le              //load side reached the band end:

    rev64       v16.8b,  v0.8b
    add         x7, x6, x3

    rev64       v17.8b,  v1.8b
    sub         x20, x0, #8
    csel        x0, x20, x0,le              //  x0 += nt - 8 in total

    rev64       v18.8b,  v2.8b
    csel        x12, x4, x12,le

    rev64       v19.8b,  v3.8b
    add         x9, x7, x3

    rev64       v20.8b,  v4.8b
    sub         x10,x0,#1

    rev64       v21.8b,  v5.8b
    subs        x1, x1, #8

    rev64       v22.8b,  v6.8b
    add         x14, x9, x3

    rev64       v23.8b,  v7.8b

    bne         kernel_mode2

epilogue_mode2:

    // store the last tile; nothing is left to load
    st1         {v16.8b},[x6],x5
    st1         {v17.8b},[x7],x5
    st1         {v18.8b},[x9],x5
    st1         {v19.8b},[x14],x5
    st1         {v20.8b},[x6],x5
    st1         {v21.8b},[x7],x5
    st1         {v22.8b},[x9],x5
    st1         {v23.8b},[x14],x5

    b           end_func

mode2_4:

    // 4x4 block. x8 is still -2 from the function entry.
    // each row is loaded as 8 bytes and reversed, and only the first 4 bytes
    // of the reversed vector are stored; the lowest address read is
    // pu1_ref - 4
    sub         x0,x0,#1                    //8 bytes from here end at pu1_ref[6]
    sub         x10,x0,#1

    ld1         {v0.8b},[x0],x8
    add         x5,x2,x3                    //x5, x6, x7 = destination rows 1, 2, 3
    ld1         {v2.8b},[x10],x8
    add         x6,x5,x3
    ld1         {v4.8b},[x0]
    add         x7,x6,x3
    ld1         {v6.8b},[x10]

    rev64       v1.8b,  v0.8b
    rev64       v3.8b,  v2.8b



    st1         {v1.s}[0],[x2]
    rev64       v5.8b,  v4.8b
    st1         {v3.s}[0],[x5]
    rev64       v7.8b,  v6.8b
    st1         {v5.s}[0],[x6]
    st1         {v7.s}[0],[x7]

end_func:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16

    ret
