1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19///*******************************************************************************
20//* //file
21//*  ihevcd_itrans_recon_dc_chroma.s
22//*
23//* //brief
24//*  contains function definitions itrans and recon for dc only case
25//*
26//* //author
27//*  ittiam
28//*
//* //par list of functions:
//*  - ihevcd_itrans_recon_dc_chroma_av8()
31//*
32//* //remarks
33//*  none
34//*
35//*******************************************************************************/
36
37
38.text
39.include "ihevc_neon_macros.s"
40
41
.globl ihevcd_itrans_recon_dc_chroma_av8

.type ihevcd_itrans_recon_dc_chroma_av8, %function

//-----------------------------------------------------------------------------
// void ihevcd_itrans_recon_dc_chroma(uword8 *pu1_pred,
//                                    uword8 *pu1_dst,
//                                    word32 pred_strd,
//                                    word32 dst_strd,
//                                    word32 log2_trans_size,
//                                    word16 i2_coeff_value)
//
// DC-only inverse transform and reconstruction on byte-interleaved data
// (presumably interleaved chroma, going by the function name):
//   - a single dc_value is derived from i2_coeff_value (two rounding/shift
//     stages, each clamped to the signed 16-bit range);
//   - dc_value is added to every even-indexed byte of each pred row, the sum
//     is saturated to 0..255 and written to the same position in dst;
//   - the odd-indexed bytes are copied from pred to dst unmodified.
//
// Syntax/ABI: GNU as, AArch64, AAPCS64.
//
// In:
//   x0 : pu1_pred
//   x1 : pu1_dst
//   w2 : pred_strd       (sign-extended into x2 on entry)
//   w3 : dst_strd        (sign-extended into x3 on entry)
//   w4 : log2_trans_size (replaced by trans_size = 1 << log2_trans_size)
//   w5 : i2_coeff_value  (low 16 bits, sign-extended into x5 on entry)
//
// Working registers:
//   x6      : dc_value
//   x7      : stage-2 rounding constant, later pred row pointer inside a tile
//   x8      : rows remaining
//   x9      : columns remaining in the current band of 8 rows
//   x10     : constant 1
//   x11     : dst row pointer inside a tile
//   x19,x20 : clamp scratch (callee-saved, spilled to the stack here)
//   v0      : dc_value replicated into 8 halfword lanes
//   v2-v17, v18-v31 : pixel data / widened sums
//-----------------------------------------------------------------------------
ihevcd_itrans_recon_dc_chroma_av8:

    push_v_regs                             // macro from ihevc_neon_macros.s; presumably spills the callee-saved SIMD regs (v8-v15 are written below) - confirm
    stp         x19, x20,[sp,#-16]!

    // AAPCS64 leaves the bits above a narrow integer argument unspecified, so
    // every 32/16-bit argument must be extended before it is used as a 64-bit
    // value. The strides feed 64-bit address arithmetic below.
    sxtw        x2, w2                      // pred_strd
    sxtw        x3, w3                      // dst_strd
    sxth        x5, w5                      // i2_coeff_value (word16)

    mov         x10,#1
    lsl         x4,x10,x4                   // trans_size = (1 << log2_trans_size); a register shift only uses the low 6 bits of x4
    mov         x6,#64                      // 1 << (shift1 - 1)
    mov         x7,#2048                    // 1 << (shift2 - 1)

    // stage 1: x8 = clip_s16((i2_coeff_value * 64 + 64) >> 7)
    add         x8,x6,x5,lsl #6
    asr         x20,x8,#7
    mov         x19,#32767
    cmp         x20,x19
    csel        x8,x19,x20,gt               // upper bound
    mov         x19,#-32768
    cmp         x8,x19
    csel        x8,x19,x8,lt                // lower bound

    // stage 2: x6 = dc_value = clip_s16((x8 * 64 + 2048) >> 12)
    add         x5,x7,x8,lsl #6
    asr         x20,x5,#12
    mov         x19,#32767
    cmp         x20,x19
    csel        x6,x19,x20,gt               // upper bound
    mov         x19,#-32768
    cmp         x6,x19
    csel        x6,x19,x6,lt                // lower bound

    mov         x8,x4                       // rows remaining = trans_size

    // x6 has the dc_value
    // x4 has the trans_size value
    // x8 has the row value
    dup         v0.8h,w6                    // dc_value in all 8 halfword lanes
    cmp         x4,#4
    beq         row_loop_4chroma            // 4x4 is handled by a dedicated straight-line path


    // Generic path: the block is processed in tiles of 8 rows x 16 bytes.
row_loop_chroma:
    mov         x9,x4                       // columns remaining in this band


col_loop_chroma:

    // Load 8 rows; ld2 splits each 16-byte row into even bytes (first
    // register) and odd bytes (second register).
    mov         x7,x0
    ld2         {v2.8b, v3.8b},[x7],x2
    ld2         {v4.8b, v5.8b},[x7],x2
    ld2         {v6.8b, v7.8b},[x7],x2
    ld2         {v8.8b, v9.8b},[x7],x2

    ld2         {v10.8b, v11.8b},[x7],x2
    ld2         {v12.8b, v13.8b},[x7],x2
    ld2         {v14.8b, v15.8b},[x7],x2
    ld2         {v16.8b, v17.8b},[x7]

    add         x0,x0,#16                   // next tile to the right in pred


    // Widen the even bytes to 16 bits and add dc_value.
    uaddw       v30.8h,  v0.8h ,  v2.8b
    uaddw       v28.8h,  v0.8h ,  v4.8b
    uaddw       v26.8h,  v0.8h ,  v6.8b
    uaddw       v24.8h,  v0.8h ,  v8.8b
    uaddw       v22.8h,  v0.8h ,  v10.8b
    uaddw       v20.8h,  v0.8h ,  v12.8b
    uaddw       v18.8h,  v0.8h ,  v14.8b


    // Narrow back with signed->unsigned saturation (clip to 0..255).
    mov         x11,x1
    sqxtun      v2.8b, v30.8h
    sqxtun      v4.8b, v28.8h
    sqxtun      v6.8b, v26.8h
    sqxtun      v8.8b, v24.8h

    uaddw       v30.8h,  v0.8h ,  v16.8b    // row 7 reuses v30 once row 0 has been narrowed

    sqxtun      v10.8b, v22.8h
    sqxtun      v12.8b, v20.8h
    sqxtun      v14.8b, v18.8h
    sqxtun      v16.8b, v30.8h

    // Store 8 rows; st2 re-interleaves the updated even bytes with the
    // untouched odd bytes.
    st2         {v2.8b, v3.8b},[x11],x3
    st2         {v4.8b, v5.8b},[x11],x3
    st2         {v6.8b, v7.8b},[x11],x3
    st2         {v8.8b, v9.8b},[x11],x3

    st2         {v10.8b, v11.8b},[x11],x3
    st2         {v12.8b, v13.8b},[x11],x3
    st2         {v14.8b, v15.8b},[x11],x3
    st2         {v16.8b, v17.8b},[x11]

    add         x1,x1,#16                   // next tile to the right in dst

    subs        x9,x9,#8
    bgt         col_loop_chroma

    subs        x8,x8,#8                    // flags consumed by the bgt below; the add/sub in between do not set flags

    // Move both pointers down 8 rows and back to the left edge
    // (2 * trans_size bytes were consumed per row).
    add         x0,x0,x2,lsl #3
    add         x1,x1,x3,lsl #3
    sub         x0,x0,x4,lsl #1
    sub         x1,x1,x4,lsl #1
    bgt         row_loop_chroma
    b           end_loops_chroma


    // 4x4 path: 4 rows, 8 bytes stored per row.
    // NOTE(review): each ld2 below reads 16 bytes from pred although only the
    // first 8 are used - confirm the pred buffer always has that slack.
row_loop_4chroma:

    ld2         {v2.8b, v3.8b},[x0],x2
    ld2         {v4.8b, v5.8b},[x0],x2
    ld2         {v6.8b, v7.8b},[x0],x2
    ld2         {v8.8b, v9.8b},[x0]

    // Widen the even bytes and add dc_value.
    uaddw       v30.8h,  v0.8h ,  v2.8b
    uaddw       v28.8h,  v0.8h ,  v4.8b
    uaddw       v26.8h,  v0.8h ,  v6.8b
    uaddw       v24.8h,  v0.8h ,  v8.8b

    // Clip to 0..255.
    sqxtun      v31.8b, v30.8h
    sqxtun      v29.8b, v28.8h
    sqxtun      v27.8b, v26.8h
    sqxtun      v25.8b, v24.8h

    // Re-interleave the first 4 updated even bytes with the first 4 odd bytes.
    zip1        v2.8b, v31.8b, v3.8b
    zip1        v4.8b, v29.8b, v5.8b
    zip1        v6.8b, v27.8b, v7.8b
    zip1        v8.8b, v25.8b, v9.8b

    st1         {v2.2s},[x1],x3
    st1         {v4.2s},[x1],x3
    st1         {v6.2s},[x1],x3
    st1         {v8.2s},[x1]

end_loops_chroma:
    ldp         x19, x20,[sp],#16
    pop_v_regs                              // counterpart of push_v_regs
    ret
219
220
221