1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21// *******************************************************************************
22// * @file
23// *  ih264_mem_fns_neon.s
24// *
25// * @brief
26// *  Contains function definitions for memory manipulation
27// *
28// * @author
29// *     Naveen SR
30// *
// * @par List of Functions:
// *  - ih264_memcpy_av8()
// *  - ih264_memcpy_mul_8_av8()
// *  - ih264_memset_mul_8_av8()
// *  - ih264_memset_av8()
// *  - ih264_memset_16bit_mul_8_av8()
// *  - ih264_memset_16bit_av8()
37// *
38// * @remarks
39// *  None
40// *
41// *******************************************************************************
42//*/
43
44.text
45.p2align 2
46.include "ih264_neon_macros.s"
47///**
48//*******************************************************************************
49//*
50//* @brief
51//*   memcpy of a 1d array
52//*
//* @par Description:
//*   Copies 8-bit data from source to destination. The byte count is expected
//*   to be 8, 16 or 32 (a non-zero multiple of 8).
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] num_bytes
//*  number of bytes to copy
64//* @returns
65//*
66//* @remarks
67//*  None
68//*
69//*******************************************************************************
70//*/
71//void ih264_memcpy_mul_8(UWORD8 *pu1_dst,
72//                      UWORD8 *pu1_src,
73//                      UWORD32 num_bytes)
74//**************Variables Vs Registers*************************
75//    x0 => *pu1_dst
76//    x1 => *pu1_src
77//    w2 => num_bytes
78
79
80
81
82
83    .global ih264_memcpy_mul_8_av8
84
85ih264_memcpy_mul_8_av8:
86
87loop_neon_memcpy_mul_8:
88    // Memcpy 8 bytes
89    ld1       {v0.8b}, [x1], #8
90    st1       {v0.8b}, [x0], #8
91
92    subs      w2, w2, #8
93    bne       loop_neon_memcpy_mul_8
94    ret
95
96
97
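///**
//*******************************************************************************
//*
//* @brief
//*   memcpy of a 1d array
//*
//*******************************************************************************
//*/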
100//void ih264_memcpy(UWORD8 *pu1_dst,
101//                  UWORD8 *pu1_src,
102//                  UWORD32 num_bytes)
103//**************Variables Vs Registers*************************
104//    x0 => *pu1_dst
105//    x1 => *pu1_src
106//    w2 => num_bytes
107
108
109
110    .global ih264_memcpy_av8
111
112ih264_memcpy_av8:
113    subs      w2, w2, #8
114    blt       arm_memcpy
115loop_neon_memcpy:
116    // Memcpy 8 bytes
117    ld1       {v0.8b}, [x1], #8
118    st1       {v0.8b}, [x0], #8
119
120    subs      w2, w2, #8
121    bge       loop_neon_memcpy
122    cmn       w2, #8
123    beq       end_func1
124
125arm_memcpy:
126    add       w2, w2, #8
127
128loop_arm_memcpy:
129    ldrb      w3, [x1], #1
130    strb      w3, [x0], #1
131    subs      w2, w2, #1
132    bne       loop_arm_memcpy
133    ret
134end_func1:
135    ret
136
137
138//void ih264_memset_mul_8(UWORD8 *pu1_dst,
139//                       UWORD8 value,
140//                       UWORD32 num_bytes)
141//**************Variables Vs Registers*************************
142//    x0 => *pu1_dst
//    w1 => value
//    w2 => num_bytes
145
146
147    .global ih264_memset_mul_8_av8
148
149ih264_memset_mul_8_av8:
150
151// Assumptions: numbytes is either 8, 16 or 32
152    dup       v0.8b, w1
153loop_memset_mul_8:
154    // Memset 8 bytes
155    st1       {v0.8b}, [x0], #8
156
157    subs      w2, w2, #8
158    bne       loop_memset_mul_8
159
160    ret
161
162
163//void ih264_memset(UWORD8 *pu1_dst,
164//                       UWORD8 value,
165//                       UWORD32 num_bytes)
166//**************Variables Vs Registers*************************
167//    x0 => *pu1_dst
168//    w1 => value
169//    w2 => num_bytes
170
171
172
173    .global ih264_memset_av8
174
175ih264_memset_av8:
176    subs      w2, w2, #8
177    blt       arm_memset
178    dup       v0.8b, w1
179loop_neon_memset:
180    // Memcpy 8 bytes
181    st1       {v0.8b}, [x0], #8
182
183    subs      w2, w2, #8
184    bge       loop_neon_memset
185    cmn       w2, #8
186    beq       end_func2
187
188arm_memset:
189    add       w2, w2, #8
190
191loop_arm_memset:
192    strb      w1, [x0], #1
193    subs      w2, w2, #1
194    bne       loop_arm_memset
195    ret
196end_func2:
197    ret
198
199
200
201
202
203//void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst,
204//                                      UWORD16 value,
205//                                      UWORD32 num_words)
206//**************Variables Vs Registers*************************
207//    x0 => *pu2_dst
208//    w1 => value
209//    w2 => num_words
210
211
212    .global ih264_memset_16bit_mul_8_av8
213
214ih264_memset_16bit_mul_8_av8:
215
216// Assumptions: num_words is either 8, 16 or 32
217
218    // Memset 8 words
219    dup       v0.4h, w1
220loop_memset_16bit_mul_8:
221    st1       {v0.4h}, [x0], #8
222    st1       {v0.4h}, [x0], #8
223
224    subs      w2, w2, #8
225    bne       loop_memset_16bit_mul_8
226
227    ret
228
229
230
231//void ih264_memset_16bit(UWORD16 *pu2_dst,
232//                       UWORD16 value,
233//                       UWORD32 num_words)
234//**************Variables Vs Registers*************************
235//    x0 => *pu2_dst
236//    w1 => value
237//    w2 => num_words
238
239
240
241    .global ih264_memset_16bit_av8
242
243ih264_memset_16bit_av8:
244    subs      w2, w2, #8
245    blt       arm_memset_16bit
246    dup       v0.4h, w1
247loop_neon_memset_16bit:
248    // Memset 8 words
249    st1       {v0.4h}, [x0], #8
250    st1       {v0.4h}, [x0], #8
251
252    subs      w2, w2, #8
253    bge       loop_neon_memset_16bit
254    cmn       w2, #8
255    beq       end_func3
256
257arm_memset_16bit:
258    add       w2, w2, #8
259
260loop_arm_memset_16bit:
261    strh      w1, [x0], #2
262    subs      w2, w2, #1
263    bne       loop_arm_memset_16bit
264    ret
265
266end_func3:
267    ret
268
269
270
271