1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19///*******************************************************************************
20//* //file
21//*  ihevcd_itrans_recon_dc_luma.s
22//*
23//* //brief
24//*  contains function definitions itrans and recon for dc only case
25//*
26//* //author
27//*  ittiam
28//*
29//* //par list of functions:
30//*
31//*
32//* //remarks
33//*  none
34//*
35//*******************************************************************************/
36
37.text
38.include "ihevc_neon_macros.s"
39
40
41
42.globl ihevcd_itrans_recon_dc_luma_av8
43
44.type ihevcd_itrans_recon_dc_luma_av8, %function
45
ihevcd_itrans_recon_dc_luma_av8:

//-----------------------------------------------------------------------------
// void ihevcd_itrans_recon_dc_luma(uword8 *pu1_pred,
//                                  uword8 *pu1_dst,
//                                  word32 pred_strd,
//                                  word32 dst_strd,
//                                  word32 log2_trans_size,
//                                  word16 i2_coeff_value)
//
// DC-only inverse transform + reconstruction: a single dc value is derived
// from i2_coeff_value and added, with saturation to 0..255, to every pixel of
// a trans_size x trans_size prediction block; the result is written to dst.
//
// ABI  : AAPCS64, leaf function, no stack usage.
// In   : x0 = pu1_pred           x1 = pu1_dst
//        w2 = pred_strd          w3 = dst_strd      (bytes, signed 32 bit)
//        w4 = log2_trans_size    w5 = i2_coeff_value (signed 16 bit)
// Out  : none (dst block is written)
// Clobb: x0-x13, v0-v7, v16-v18, v20, v22, v24, v26, v28, v30, flags
//
// Block walk: trans_size == 4 is handled by a dedicated 4x4 path; every
// other size is processed in 8x8 tiles (columns inner, rows outer).
//-----------------------------------------------------------------------------

    // The 32-bit arguments arrive in w-registers; AAPCS64 leaves bits [63:32]
    // unspecified, so widen them before they are used as 64-bit address
    // increments (this also makes negative strides work).
    sxtw        x2, w2                      // pred_strd
    sxtw        x3, w3                      // dst_strd
    sxth        x5, w5                      // i2_coeff_value

    mov         x10, #1
    lsl         x4, x10, x4                 // trans_size = 1 << log2_trans_size
                                            // (register shift uses only the low 6 bits of x4)

    mov         x12, #32767                 // upper limit of a signed 16-bit value
    mov         x13, #-32768                // lower limit of a signed 16-bit value

    // stage 1: clip_s16((coeff * 64 + (1 << (shift1 - 1))) >> shift1), shift1 = 7
    mov         x6, #64
    add         x8, x6, x5, lsl #6
    asr         x8, x8, #7
    cmp         x8, x12
    csel        x8, x12, x8, gt
    cmp         x8, x13
    csel        x8, x13, x8, lt

    // stage 2: clip_s16((stage1 * 64 + (1 << (shift2 - 1))) >> shift2), shift2 = 12
    mov         x7, #2048
    add         x6, x7, x8, lsl #6
    asr         x6, x6, #12
    cmp         x6, x12
    csel        x6, x12, x6, gt
    cmp         x6, x13
    csel        x6, x13, x6, lt

    // x6 = dc value, x4 = trans_size, x8 = rows left, x9 = columns left
    dup         v0.8h, w6                   // dc replicated into 8 halfword lanes
    mov         x8, x4
    cmp         x4, #4
    beq         .Ldc_luma_4x4


.Ldc_luma_row_loop:
    mov         x9, x4                      // restart the column count for this tile row

.Ldc_luma_col_loop:
    // fetch one 8x8 tile of prediction samples
    mov         x7, x0
    ld1         {v2.8b}, [x7], x2
    ld1         {v3.8b}, [x7], x2
    ld1         {v4.8b}, [x7], x2
    ld1         {v5.8b}, [x7], x2
    ld1         {v6.8b}, [x7], x2
    ld1         {v7.8b}, [x7], x2
    ld1         {v1.8b}, [x7], x2
    ld1         {v17.8b}, [x7]

    add         x0, x0, #8                  // next tile to the right (pred)

    // dc + zero-extended pred, kept in 16-bit lanes
    uaddw       v30.8h, v0.8h, v2.8b
    uaddw       v28.8h, v0.8h, v3.8b
    uaddw       v26.8h, v0.8h, v4.8b
    uaddw       v24.8h, v0.8h, v5.8b
    uaddw       v22.8h, v0.8h, v6.8b
    uaddw       v20.8h, v0.8h, v7.8b
    uaddw       v18.8h, v0.8h, v1.8b
    uaddw       v16.8h, v0.8h, v17.8b

    mov         x11, x1

    // signed 16 bit -> unsigned 8 bit with saturation (clip to 0..255)
    sqxtun      v2.8b, v30.8h
    sqxtun      v3.8b, v28.8h
    sqxtun      v4.8b, v26.8h
    sqxtun      v5.8b, v24.8h
    sqxtun      v6.8b, v22.8h
    sqxtun      v7.8b, v20.8h
    sqxtun      v1.8b, v18.8h
    sqxtun      v17.8b, v16.8h

    // write the reconstructed 8x8 tile (8 bytes per row)
    st1         {v2.8b}, [x11], x3
    st1         {v3.8b}, [x11], x3
    st1         {v4.8b}, [x11], x3
    st1         {v5.8b}, [x11], x3
    st1         {v6.8b}, [x11], x3
    st1         {v7.8b}, [x11], x3
    st1         {v1.8b}, [x11], x3
    st1         {v17.8b}, [x11]

    add         x1, x1, #8                  // next tile to the right (dst)

    subs        x9, x9, #8
    bgt         .Ldc_luma_col_loop

    subs        x8, x8, #8                  // flags from here feed the bgt below;
                                            // the add/sub in between do not set flags

    // step down 8 rows and back to the left edge of the block
    add         x0, x0, x2, lsl #3
    add         x1, x1, x3, lsl #3
    sub         x0, x0, x4
    sub         x1, x1, x4
    bgt         .Ldc_luma_row_loop
    b           .Ldc_luma_done


.Ldc_luma_4x4:
    // Single 4x4 block. Only 4 bytes per row are loaded so that nothing past
    // the block's rows is read; the untouched upper lanes are never stored.
    ld1         {v2.s}[0], [x0], x2
    ld1         {v3.s}[0], [x0], x2
    ld1         {v4.s}[0], [x0], x2
    ld1         {v5.s}[0], [x0]

    uaddw       v30.8h, v0.8h, v2.8b
    uaddw       v28.8h, v0.8h, v3.8b
    uaddw       v26.8h, v0.8h, v4.8b
    uaddw       v24.8h, v0.8h, v5.8b

    sqxtun      v2.8b, v30.8h
    sqxtun      v3.8b, v28.8h
    sqxtun      v4.8b, v26.8h
    sqxtun      v5.8b, v24.8h

    // store the 4 reconstructed bytes of each row
    st1         {v2.s}[0], [x1], x3
    st1         {v3.s}[0], [x1], x3
    st1         {v4.s}[0], [x1], x3
    st1         {v5.s}[0], [x1]

.Ldc_luma_done:
    ret
211
212
213
214
215
216
217
218
219