1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
20//* @file
21//*  ihevc_intra_pred_chroma_ver_neon.s
22//*
23//* @brief
//*  contains the function definition for chroma vertical intra prediction.
//*  the function is coded in neon assembly (gnu assembler syntax) for
//*  armv8 aarch64
28//*
29//* @author
30//*  yogeswaran rs
31//*
32//* @par list of functions:
//*  - ihevc_intra_pred_chroma_ver_av8()
34//*
35//* @remarks
36//*  none
37//*
38//*******************************************************************************
39//*/
40///**
41//*******************************************************************************
42//*
43//* @brief
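//*    chroma intra prediction for the vertical mode
//*
//* @par description:
//*    copies the reference samples starting at pu1_ref + 4 * nt + 2 into
//*    every row of the destination block
//*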
48//* @param[in] pu1_ref
49//*  uword8 pointer to the source
50//*
51//* @param[out] pu1_dst
52//*  uword8 pointer to the destination
53//*
54//* @param[in] src_strd
55//*  integer source stride
56//*
57//* @param[in] dst_strd
58//*  integer destination stride
59//*
60//* @param[in] nt
//*  size of transform block
62//*
63//* @param[in] mode
64//*  type of filtering
65//*
66//* @returns
67//*
68//* @remarks
69//*  none
70//*
71//*******************************************************************************
72//*/
73
74//void ihevc_intra_pred_chroma_ver(uword8 *pu1_ref,
75//        word32 src_strd,
76//        uword8 *pu1_dst,
77//        word32 dst_strd,
78//        word32 nt,
79//        word32 mode)
80//**************variables vs registers*****************************************
81//x0 => *pu1_ref
82//x1 => src_strd
83//x2 => *pu1_dst
84//x3 => dst_strd
85
//x4 => nt
//x5 => mode (not read; x5 is overwritten and reused as a scratch register)
89
90.text
91.align 4
92.include "ihevc_neon_macros.s"
93
94
.globl ihevc_intra_pred_chroma_ver_av8

.type ihevc_intra_pred_chroma_ver_av8, %function

//------------------------------------------------------------------------------
// void ihevc_intra_pred_chroma_ver_av8(uword8 *pu1_ref, word32 src_strd,
//                                      uword8 *pu1_dst, word32 dst_strd,
//                                      word32 nt, word32 mode)
//
// Vertical intra prediction: the run of reference bytes that starts at
// pu1_ref + 4*nt + 2 is copied unchanged into every row of the destination
// block.
//
// In:    x0 = pu1_ref
//        w1 = src_strd (not read)
//        x2 = pu1_dst
//        w3 = dst_strd, byte distance between destination rows
//        w4 = nt, selects the copy path:
//               nt >  8 : 32 bytes per row, 4 rows per loop pass until nt
//                         rows have been written
//               nt == 8 : 16 bytes per row, 8 rows
//               nt <  8 :  8 bytes per row, 4 rows
//        w5 = mode (not read; x5 is reused as scratch)
// Out:   none
// Clobb: x2-x6, x8, x10, x11, v0, v20-v23, condition flags
// Stack: only whatever push_v_regs/pop_v_regs use
//------------------------------------------------------------------------------
ihevc_intra_pred_chroma_ver_av8:

    // NOTE(review): push_v_regs/pop_v_regs come from ihevc_neon_macros.s and
    // presumably spill/restore v8-v15. This routine only touches v0 and
    // v20-v23, so the pair looks removable - confirm against the macro file.
    push_v_regs

    // dst_strd and nt are 32-bit arguments; bits 63:32 of x3/x4 are
    // unspecified on entry, so widen them before 64-bit arithmetic.
    sxtw        x3, w3
    sxtw        x4, w4

    // Source pointer, shared by all three paths.
    lsl         x5, x4, #2                  //4nt
    add         x5, x5, #2                  //4nt+2
    add         x6, x0, x5                  //x6 = pu1_ref + 4nt + 2

    // One destination pointer per row of a 4-row group.
    add         x5, x2, x3                  //pu1_dst + dst_strd
    add         x8, x5, x3                  //pu1_dst + 2*dst_strd
    add         x10, x8, x3                 //pu1_dst + 3*dst_strd
    lsl         x11, x3, #2                 //4*dst_strd (step to the next 4-row group)

    cmp         x4, #8
    beq         blk_8
    blt         blk_4

copy_16:
    ld2         {v20.8b, v21.8b}, [x6],#16  //reference bytes 0:15
    ld2         {v22.8b, v23.8b}, [x6]      //reference bytes 16:31

    // After the first 16-byte store each row pointer sits 16 bytes into its
    // row, so the second store steps by 4*dst_strd - 16 to land on the start
    // of the row four lines further down.
    sub         x11, x11, #16

kernel_copy_16:                             //x4 = rows still to write
    st2         {v20.8b, v21.8b}, [x2],#16
    st2         {v20.8b, v21.8b}, [x5],#16
    st2         {v20.8b, v21.8b}, [x8],#16
    st2         {v20.8b, v21.8b}, [x10],#16

    st2         {v22.8b, v23.8b}, [x2], x11
    st2         {v22.8b, v23.8b}, [x5], x11
    st2         {v22.8b, v23.8b}, [x8], x11
    st2         {v22.8b, v23.8b}, [x10], x11

    subs        x4, x4, #4
    bgt         kernel_copy_16              //bgt (not bne) so the loop ends even if nt is not a multiple of 4

    b           end_func

blk_8:
    ld2         {v20.8b, v21.8b}, [x6]      //reference bytes 0:15

    // rows 0-3, then step every pointer down four rows
    st2         {v20.8b, v21.8b}, [x2],x11
    st2         {v20.8b, v21.8b}, [x5],x11
    st2         {v20.8b, v21.8b}, [x8],x11
    st2         {v20.8b, v21.8b}, [x10],x11

    // rows 4-7
    st2         {v20.8b, v21.8b}, [x2]
    st2         {v20.8b, v21.8b}, [x5]
    st2         {v20.8b, v21.8b}, [x8]
    st2         {v20.8b, v21.8b}, [x10]

    b           end_func

blk_4:
    ld1         {v0.8b},[x6]                //reference bytes 0:7

    st1         {v0.8b},[x2]
    st1         {v0.8b},[x5]
    st1         {v0.8b},[x8]
    st1         {v0.8b},[x10]

end_func:
    pop_v_regs
    ret
230
231
232
233