1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21// *******************************************************************************
22// * @file
23// *  ih264_mem_fns_neon.s
24// *
25// * @brief
26// *  Contains function definitions for memory manipulation
27// *
28// * @author
29// *     Naveen SR
30// *
// * @par List of Functions:
// *  - ih264_memcpy_av8()
// *  - ih264_memcpy_mul_8_av8()
// *  - ih264_memset_av8()
// *  - ih264_memset_mul_8_av8()
// *  - ih264_memset_16bit_mul_8_av8()
// *  - ih264_memset_16bit_av8()
37// *
38// * @remarks
39// *  None
40// *
41// *******************************************************************************
42//*/
43
44.text
45.p2align 2
46.include "ih264_neon_macros.s"
47///**
48//*******************************************************************************
49//*
50//* @brief
51//*   memcpy of a 1d array
52//*
53//* @par Description:
//*   Copies 8-bit data from source to destination for 8, 16 or 32 bytes
55//*
//* @param[out] pu1_dst
57//*  UWORD8 pointer to the destination
58//*
59//* @param[in] pu1_src
60//*  UWORD8 pointer to the source
61//*
62//* @param[in] num_bytes
63//*  number of bytes to copy
64//* @returns
65//*
66//* @remarks
67//*  None
68//*
69//*******************************************************************************
70//*/
//void ih264_memcpy_mul_8(UWORD8 *pu1_dst,
//                      UWORD8 *pu1_src,
//                      UWORD8 num_bytes)
//**************Variables Vs Registers*************************
//    x0 => *pu1_dst    (post-incremented by 8 per iteration)
//    x1 => *pu1_src    (post-incremented by 8 per iteration)
//    w2 => num_bytes   (expected to be a multiple of 8)
//
// Copies num_bytes bytes from pu1_src to pu1_dst, 8 bytes per iteration.
// Modifies: x0, x1, x2, v0, condition flags. Leaf routine, no stack use.
//
// num_bytes is read as a 32-bit value (w2): bits 63:32 of x2 are not
// defined by the procedure call standard for a 32-bit or narrower argument.
// NOTE(review): the prototype above says UWORD8; if that is the real C type,
// bits 31:8 still depend on the caller zero-extending - confirm against the
// C declaration.
//
// num_bytes == 0 copies nothing. A count that is not a multiple of 8
// violates the contract; the unsigned loop test only guarantees termination
// in that case (the final iteration still moves a full 8 bytes).

    .global ih264_memcpy_mul_8_av8

ih264_memcpy_mul_8_av8:
    cbz       w2, end_neon_memcpy_mul_8     // nothing to copy

loop_neon_memcpy_mul_8:
    // Memcpy 8 bytes
    ld1       {v0.8b}, [x1], #8
    st1       {v0.8b}, [x0], #8

    subs      w2, w2, #8
    b.hi      loop_neon_memcpy_mul_8        // continue while bytes remain (no borrow, non-zero)

end_neon_memcpy_mul_8:
    ret
95
96
97
98//*******************************************************************************
99//*/
//void ih264_memcpy(UWORD8 *pu1_dst,
//                  UWORD8 *pu1_src,
//                  UWORD8 num_bytes)
//**************Variables Vs Registers*************************
//    x0 => *pu1_dst    (post-incremented as bytes are written)
//    x1 => *pu1_src    (post-incremented as bytes are read)
//    w2 => num_bytes   (any value, including 0)
//    w3 => scratch byte for the tail loop
//
// Copies num_bytes bytes from pu1_src to pu1_dst: 8 bytes at a time through
// v0 while at least 8 remain, then one byte at a time for the remainder.
// Modifies: x0, x1, x2, x3, v0, condition flags. Leaf routine, no stack use.
//
// num_bytes is read as a 32-bit value (w2): bits 63:32 of x2 are not
// defined by the procedure call standard for a 32-bit or narrower argument.
// NOTE(review): the prototype above says UWORD8; if that is the real C type,
// bits 31:8 still depend on the caller zero-extending - confirm against the
// C declaration.

    .global ih264_memcpy_av8

ih264_memcpy_av8:
    // The counter is kept biased by -8 so the borrow out of the subtract
    // tells whether a full 8-byte chunk is still available.
    subs      w2, w2, #8
    b.lo      arm_memcpy                // fewer than 8 bytes in total

loop_neon_memcpy:
    // Memcpy 8 bytes
    ld1       {v0.8b}, [x1], #8
    st1       {v0.8b}, [x0], #8

    subs      w2, w2, #8
    b.hs      loop_neon_memcpy          // no borrow: another full chunk remains

arm_memcpy:
    // Remove the bias: w2 = bytes still to copy (0..7).
    // Zero covers both "length was a multiple of 8" and "length was 0".
    adds      w2, w2, #8
    b.eq      end_func1

loop_arm_memcpy:
    // Memcpy 1 byte
    ldrb      w3, [x1], #1
    strb      w3, [x0], #1
    subs      w2, w2, #1
    b.ne      loop_arm_memcpy

end_func1:
    ret
138
139
//void ih264_memset_mul_8(UWORD8 *pu1_dst,
//                       UWORD8 value,
//                       UWORD8 num_bytes)
//**************Variables Vs Registers*************************
//    x0 => *pu1_dst    (post-incremented by 8 per iteration)
//    w1 => value       (only the low byte is used)
//    w2 => num_bytes   (expected to be a multiple of 8)
//
// Fills num_bytes bytes at pu1_dst with value, 8 bytes per iteration.
// Modifies: x0, x2, v0, condition flags. Leaf routine, no stack use.
//
// num_bytes is read as a 32-bit value (w2): bits 63:32 of x2 are not
// defined by the procedure call standard for a 32-bit or narrower argument.
// NOTE(review): the prototype above says UWORD8; if that is the real C type,
// bits 31:8 still depend on the caller zero-extending - confirm against the
// C declaration.
//
// num_bytes == 0 writes nothing. A count that is not a multiple of 8
// violates the contract; the unsigned loop test only guarantees termination
// in that case (the final iteration still writes a full 8 bytes).

    .global ih264_memset_mul_8_av8

ih264_memset_mul_8_av8:

// Assumptions: numbytes is either 8, 16 or 32
    cbz       w2, end_memset_mul_8      // nothing to fill
    dup       v0.8b, w1                 // replicate the fill byte into 8 lanes

loop_memset_mul_8:
    // Memset 8 bytes
    st1       {v0.8b}, [x0], #8

    subs      w2, w2, #8
    b.hi      loop_memset_mul_8         // continue while bytes remain (no borrow, non-zero)

end_memset_mul_8:
    ret
163
164
//void ih264_memset(UWORD8 *pu1_dst,
//                       UWORD8 value,
//                       UWORD8 num_bytes)
//**************Variables Vs Registers*************************
//    x0 => *pu1_dst    (post-incremented as bytes are written)
//    w1 => value       (only the low byte is used)
//    w2 => num_bytes   (any value, including 0)
//
// Fills num_bytes bytes at pu1_dst with value: 8 bytes at a time through v0
// while at least 8 remain, then one byte at a time for the remainder.
// Modifies: x0, x2, v0, condition flags. Leaf routine, no stack use.
//
// num_bytes is read as a 32-bit value (w2): bits 63:32 of x2 are not
// defined by the procedure call standard for a 32-bit or narrower argument.
// NOTE(review): the prototype above says UWORD8; if that is the real C type,
// bits 31:8 still depend on the caller zero-extending - confirm against the
// C declaration.

    .global ih264_memset_av8

ih264_memset_av8:
    // The counter is kept biased by -8 so the borrow out of the subtract
    // tells whether a full 8-byte chunk is still available.
    subs      w2, w2, #8
    b.lo      arm_memset                // fewer than 8 bytes in total
    dup       v0.8b, w1                 // replicate the fill byte into 8 lanes

loop_neon_memset:
    // Memset 8 bytes
    st1       {v0.8b}, [x0], #8

    subs      w2, w2, #8
    b.hs      loop_neon_memset          // no borrow: another full chunk remains

arm_memset:
    // Remove the bias: w2 = bytes still to fill (0..7).
    // Zero covers both "length was a multiple of 8" and "length was 0".
    adds      w2, w2, #8
    b.eq      end_func2

loop_arm_memset:
    // Memset 1 byte
    strb      w1, [x0], #1
    subs      w2, w2, #1
    b.ne      loop_arm_memset

end_func2:
    ret
201
202
203
204
205
//void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst,
//                                      UWORD16 value,
//                                      UWORD8 num_words)
//**************Variables Vs Registers*************************
//    x0 => *pu2_dst    (post-incremented by 16 bytes per iteration)
//    w1 => value       (only the low 16 bits are used)
//    w2 => num_words   (count of 16-bit elements, expected multiple of 8)
//
// Fills num_words 16-bit elements at pu2_dst with value, 8 elements
// (16 bytes) per iteration.
// Modifies: x0, x2, v0, condition flags. Leaf routine, no stack use.
//
// num_words is read as a 32-bit value (w2): bits 63:32 of x2 are not
// defined by the procedure call standard for a 32-bit or narrower argument.
// NOTE(review): the prototype above says UWORD8; if that is the real C type,
// bits 31:8 still depend on the caller zero-extending - confirm against the
// C declaration.
//
// num_words == 0 writes nothing. A count that is not a multiple of 8
// violates the contract; the unsigned loop test only guarantees termination
// in that case (the final iteration still writes a full 8 elements).

    .global ih264_memset_16bit_mul_8_av8

ih264_memset_16bit_mul_8_av8:

// Assumptions: num_words is either 8, 16 or 32
    cbz       w2, end_memset_16bit_mul_8    // nothing to fill
    dup       v0.8h, w1                     // replicate the value into 8 halfword lanes

loop_memset_16bit_mul_8:
    // Memset 8 words with a single 128-bit store
    st1       {v0.8h}, [x0], #16

    subs      w2, w2, #8
    b.hi      loop_memset_16bit_mul_8       // continue while elements remain (no borrow, non-zero)

end_memset_16bit_mul_8:
    ret
231
232
233
//void ih264_memset_16bit(UWORD16 *pu2_dst,
//                       UWORD16 value,
//                       UWORD8 num_words)
//**************Variables Vs Registers*************************
//    x0 => *pu2_dst    (post-incremented as elements are written)
//    w1 => value       (only the low 16 bits are used)
//    w2 => num_words   (count of 16-bit elements; any value, including 0)
//
// Fills num_words 16-bit elements at pu2_dst with value: 8 elements
// (16 bytes) at a time through v0 while at least 8 remain, then one
// element at a time for the remainder.
// Modifies: x0, x2, v0, condition flags. Leaf routine, no stack use.
//
// num_words is read as a 32-bit value (w2): bits 63:32 of x2 are not
// defined by the procedure call standard for a 32-bit or narrower argument.
// NOTE(review): the prototype above says UWORD8; if that is the real C type,
// bits 31:8 still depend on the caller zero-extending - confirm against the
// C declaration.

    .global ih264_memset_16bit_av8

ih264_memset_16bit_av8:
    // The counter is kept biased by -8 so the borrow out of the subtract
    // tells whether a full 8-element chunk is still available.
    subs      w2, w2, #8
    b.lo      arm_memset_16bit          // fewer than 8 elements in total
    dup       v0.8h, w1                 // replicate the value into 8 halfword lanes

loop_neon_memset_16bit:
    // Memset 8 words with a single 128-bit store
    st1       {v0.8h}, [x0], #16

    subs      w2, w2, #8
    b.hs      loop_neon_memset_16bit    // no borrow: another full chunk remains

arm_memset_16bit:
    // Remove the bias: w2 = elements still to fill (0..7).
    // Zero covers both "count was a multiple of 8" and "count was 0".
    adds      w2, w2, #8
    b.eq      end_func3

loop_arm_memset_16bit:
    // Memset 1 word
    strh      w1, [x0], #2
    subs      w2, w2, #1
    b.ne      loop_arm_memset_16bit

end_func3:
    ret
272
273
274
275