1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21///**
22//*******************************************************************************
23//*
24//* @brief
25//*     Interprediction luma function for copy
26//*
27//* @par Description:
28//*   Copies the array of width 'wd' and height 'ht' from the  location pointed
29//*   by 'src' to the location pointed by 'dst'
30//*
31//* @param[in] pu1_src
32//*  UWORD8 pointer to the source
33//*
34//* @param[out] pu1_dst
35//*  UWORD8 pointer to the destination
36//*
37//* @param[in] src_strd
38//*  integer source stride
39//*
40//* @param[in] dst_strd
41//*  integer destination stride
42//*
43//*
44//* @param[in] ht
45//*  integer height of the array
46//*
47//* @param[in] wd
48//*  integer width of the array
49//*
50//* @returns
51//*
52//* @remarks
53//*  None
54//*
55//*******************************************************************************
56//*/
57//void ih264_inter_pred_luma_copy (
58//                            UWORD8 *pu1_src,
59//                            UWORD8 *pu1_dst,
60//                            WORD32 src_strd,
61//                            WORD32 dst_strd,
62//                            WORD32 ht,
63//                            WORD32 wd   )
64
65//**************Variables Vs Registers*****************************************
66//    x0 => *pu1_src
67//    x1 => *pu1_dst
68//    w2 =>  src_strd
69//    w3 =>  dst_strd
70//    w4 =>  ht
71//    w5 =>  wd
72
73.text
74.p2align 2
75.include "ih264_neon_macros.s"
76
77
78
79    .global ih264_inter_pred_luma_copy_av8
80
//------------------------------------------------------------------------------
// void ih264_inter_pred_luma_copy_av8(UWORD8 *pu1_src, UWORD8 *pu1_dst,
//                                     WORD32 src_strd, WORD32 dst_strd,
//                                     WORD32 ht, WORD32 wd)
//
// Copies a block of bytes from pu1_src to pu1_dst. The block is processed in
// bands of four rows; inside a band it is copied in column strips whose width
// is picked from wd: 16 bytes if wd is a multiple of 16, else 8 bytes if wd is
// a multiple of 8, else 4 bytes.
//
// In:    x0 = pu1_src, x1 = pu1_dst, w2 = src_strd, w3 = dst_strd,
//        w4 = ht, w5 = wd
// Out:   none
// Uses:  x0-x7, x11, x12, v0-v6, condition flags
//        x4  = columns still to copy in the current band
//        x5  = source cursor for rows 1..3 of the current strip
//        x6  = destination cursor for rows 1..3 of the current strip
//        x7  = rows still to copy
//        x11 = wd - strip width (column rewind applied at the end of a band)
//        x12 = wd
//
// Notes: Returns immediately when ht <= 0 or wd <= 0.
//        Four rows are always written per band and a whole strip per column
//        step, so a ht that is not a multiple of 4 (or a wd that is not a
//        multiple of 4) is effectively rounded up.
//        NOTE(review): only v0-v6 are touched here. If push_v_regs/pop_v_regs
//        merely spill callee-saved SIMD registers they could be dropped -
//        confirm against ih264_neon_macros.s.
//------------------------------------------------------------------------------
ih264_inter_pred_luma_copy_av8:

    push_v_regs
    sxtw      x2, w2                    //src_strd widened for pointer arithmetic
    sxtw      x3, w3                    //dst_strd widened for pointer arithmetic
    sxtw      x7, w4                    //x7 = ht (x4 is reused as column counter)
    sxtw      x12, w5                   //x12 = wd (x5 is reused as row cursor)

    cmp       x7, #0
    ble       end_loops                 //nothing to copy when ht <= 0
    cmp       x12, #0
    ble       end_loops                 //nothing to copy when wd <= 0

    tst       x12, #15                  //wd multiple of 16 ?
    beq       core_loop_wd_16
    tst       x12, #7                   //wd multiple of 8 ?
    beq       core_loop_wd_8

    //------------------------------------------------------------------
    // 4-byte strips
    //------------------------------------------------------------------
    sub       x11, x12, #4              //rewind = wd - 4

outer_loop_wd_4:
    mov       x4, x12                   //columns left = wd (> 0 here)

inner_loop_wd_4:
    ld1       {v0.s}[0], [x0]           //row 0: load 4 bytes
    add       x5, x0, x2                //x5 = src row 1 of this strip
    add       x6, x1, x3                //x6 = dst row 1 of this strip
    st1       {v0.s}[0], [x1]           //row 0: store 4 bytes
    ld1       {v0.s}[0], [x5], x2       //row 1: load, step to next src row
    add       x0, x0, #4                //pu1_src += 4 (next strip)
    st1       {v0.s}[0], [x6], x3       //row 1: store, step to next dst row
    ld1       {v0.s}[0], [x5], x2       //row 2: load
    subs      x4, x4, #4                //columns left -= 4 (flags used by bgt below)
    st1       {v0.s}[0], [x6], x3       //row 2: store
    ld1       {v0.s}[0], [x5], x2       //row 3: load
    add       x1, x1, #4                //pu1_dst += 4 (next strip)
    st1       {v0.s}[0], [x6], x3       //row 3: store

    bgt       inner_loop_wd_4

end_inner_loop_wd_4:
    subs      x7, x7, #4                //rows left -= 4
    sub       x0, x5, x11               //pu1_src = start of next band (x5 is 4 rows down, wd-4 across)
    sub       x1, x6, x11               //pu1_dst = start of next band
    bgt       outer_loop_wd_4

    b         end_loops

    //------------------------------------------------------------------
    // 8-byte strips
    //------------------------------------------------------------------
core_loop_wd_8:
    sub       x11, x12, #8              //rewind = wd - 8

outer_loop_wd_8:
    mov       x4, x12                   //columns left = wd (> 0 here)

inner_loop_wd_8:
    add       x5, x0, x2                //x5 = src row 1 of this strip
    ld1       {v0.8b}, [x0], #8         //row 0: load 8 bytes, pu1_src += 8
    add       x6, x1, x3                //x6 = dst row 1 of this strip
    st1       {v0.8b}, [x1], #8         //row 0: store 8 bytes, pu1_dst += 8
    ld1       {v1.8b}, [x5], x2         //row 1: load
    st1       {v1.8b}, [x6], x3         //row 1: store
    subs      x4, x4, #8                //columns left -= 8 (flags used by bgt below)
    ld1       {v2.8b}, [x5], x2         //row 2: load
    st1       {v2.8b}, [x6], x3         //row 2: store
    ld1       {v3.8b}, [x5], x2         //row 3: load
    st1       {v3.8b}, [x6], x3         //row 3: store
    bgt       inner_loop_wd_8

end_inner_loop_wd_8:
    subs      x7, x7, #4                //rows left -= 4
    sub       x0, x5, x11               //pu1_src = start of next band
    sub       x1, x6, x11               //pu1_dst = start of next band
    bgt       outer_loop_wd_8

    b         end_loops

    //------------------------------------------------------------------
    // 16-byte strips
    //------------------------------------------------------------------
core_loop_wd_16:
    sub       x11, x12, #16             //rewind = wd - 16

outer_loop_wd_16:
    mov       x4, x12                   //columns left = wd (> 0 here)

inner_loop_wd_16:
    add       x5, x0, x2                //x5 = src row 1 of this strip
    ld1       {v0.16b}, [x0], #16       //row 0: load 16 bytes, pu1_src += 16
    add       x6, x1, x3                //x6 = dst row 1 of this strip
    st1       {v0.16b}, [x1], #16       //row 0: store 16 bytes, pu1_dst += 16
    ld1       {v2.16b}, [x5], x2        //row 1: load
    st1       {v2.16b}, [x6], x3        //row 1: store
    subs      x4, x4, #16               //columns left -= 16 (flags used by bgt below)
    ld1       {v4.16b}, [x5], x2        //row 2: load
    st1       {v4.16b}, [x6], x3        //row 2: store
    ld1       {v6.16b}, [x5], x2        //row 3: load
    st1       {v6.16b}, [x6], x3        //row 3: store
    bgt       inner_loop_wd_16

end_inner_loop_wd_16:
    subs      x7, x7, #4                //rows left -= 4
    sub       x0, x5, x11               //pu1_src = start of next band
    sub       x1, x6, x11               //pu1_dst = start of next band
    bgt       outer_loop_wd_16

    //------------------------------------------------------------------
    // Single exit for every path
    //------------------------------------------------------------------
end_loops:
    pop_v_regs
    ret
197
198
199// /*
200// ********************************************************************************
201// *
202// * @brief This function copies a 4x4 block to destination
203// *
204// * @par Description:
205// * Copies a 4x4 block to destination, where both src and dst are interleaved
206// *
207// * @param[in] pi2_src
208// *  Source
209// *
210// * @param[in] pu1_out
211// *  Output pointer
212// *
213// * @param[in] pred_strd,
214// *  Prediction buffer stride
215// *
216// * @param[in] out_strd
// *  Output buffer stride
218// *
219// * @returns none
220// *
221// * @remarks none
// * Currently wd and ht are not used, i.e. a 4x4 block is always copied
223// *
224// *******************************************************************************
225// */
// void ih264_interleave_copy(WORD16 *pi2_src,
//                            UWORD8 *pu1_out,
//                            WORD32 pred_strd,
//                            WORD32 out_strd,
//                            WORD32 wd,
//                            WORD32 ht)
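// Register Usage
// x0 : pi2_src
// x1 : pu1_out
// w2 : pred_strd
// w3 : out_strd
// General registers x0-x3 and Neon registers v2-v5, v18-v21 and v30 are used
// None of these are callee-saved under AAPCS64, so the routine's own register
// usage does not require pushing arm or neon registers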
239
240    .global ih264_interleave_copy_av8
ih264_interleave_copy_av8:
    // Processes 4 rows of 8 bytes. In every 16-bit pair the low byte
    // (even byte position) is replaced with the byte from the source row,
    // while the high byte (odd byte position) keeps what is already in the
    // output buffer.
    //
    // In:   x0 = source, x1 = output, w2 = source stride, w3 = output stride
    // Uses: x0-x3, v2-v5, v18-v21, v30
    push_v_regs

    movi      v30.8h, #0x00ff           //select mask: low byte of each halfword
    sxtw      x2, w2                    //source stride widened for addressing
    sxtw      x3, w3                    //output stride widened for addressing

    // Fetch the four source rows (8 bytes each)
    ld1       {v2.8b}, [x0], x2         //source row 0
    ld1       {v3.8b}, [x0], x2         //source row 1
    ld1       {v4.8b}, [x0], x2         //source row 2
    ld1       {v5.8b}, [x0], x2         //source row 3

    mov       x0, x1                    //x0 now walks the output rows for the stores

    // Fetch the four rows currently in the output buffer
    ld1       {v18.8b}, [x1], x3        //output row 0
    ld1       {v19.8b}, [x1], x3        //output row 1
    ld1       {v20.8b}, [x1], x3        //output row 2
    ld1       {v21.8b}, [x1], x3        //output row 3

    // Pack two rows per 128-bit register
    mov       v2.d[1], v3.d[0]          //v2  = source rows 0,1
    mov       v4.d[1], v5.d[0]          //v4  = source rows 2,3
    mov       v18.d[1], v19.d[0]        //v18 = output rows 0,1
    mov       v20.d[1], v21.d[0]        //v20 = output rows 2,3

    // Insert the source bits wherever the mask is set
    bit       v18.16b, v2.16b, v30.16b
    bit       v20.16b, v4.16b, v30.16b

    // Write the merged rows back
    st1       {v18.8b}, [x0], x3        //output row 0
    st1       {v18.d}[1], [x0], x3      //output row 1
    st1       {v20.8b}, [x0], x3        //output row 2
    st1       {v20.d}[1], [x0], x3      //output row 3

    pop_v_regs
    ret
272
273
274