1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20
21//******************************************************************************
22//*
23//* @brief
//*  This file contains definitions of routines for the combing artifact check (CAC)
25//*
26//* @author
27//*  Ittiam
28//*
29//* @par List of Functions:
30//*  - ideint_cac_8x8_av8()
31//*
32//* @remarks
33//*  None
34//*
35//*******************************************************************************
36
37
38//******************************************************************************
39//*
40//*  @brief Calculates Combing Artifact
41//*
42//*  @par   Description
//*   This function calculates the combing artifact check (CAC) for the given two fields
44//*
45//* @param[in] pu1_top
46//*  UWORD8 pointer to top field
47//*
48//* @param[in] pu1_bot
49//*  UWORD8 pointer to bottom field
50//*
51//* @param[in] top_strd
52//*  Top field stride
53//*
54//* @param[in] bot_strd
55//*  Bottom field stride
56//*
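//* @returns
//*     1 if, in either 4-column half of the 8x8 block, the adjacent-field
//*     difference measure (adj) exceeds the biased alternate-row measure (alt);
//*     0 otherwise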
59//*
60//* @remarks
61//*
62//******************************************************************************
63
64    .global ideint_cac_8x8_av8
65
66ideint_cac_8x8_av8:
67
68    // Load first row of top
69    ld1     {v28.8b},       [x0],       x2
70
71    // Load first row of bottom
72    ld1     {v29.8b},       [x1],       x3
73    mov     v28.d[1],       v29.d[0]
74
75    // Load second row of top
76    ld1     {v30.8b},       [x0],       x2
77
78    // Load second row of bottom
79    ld1     {v31.8b},       [x1],       x3
80    mov     v30.d[1],       v31.d[0]
81
82
83    // Calculate row based adj and alt values
84    // Get row sums
85    uaddlp  v0.8h,          v28.16b
86
87    uaddlp  v2.8h,          v30.16b
88
89    uaddlp  v0.4s,          v0.8h
90
91    uaddlp  v2.4s,          v2.8h
92
93    // Both v0 and v2 have four 32 bit sums corresponding to first 4 rows
94    // Pack v0 and v2 into a single register (sum does not exceed 16bits)
95
96    shl     v16.4s,         v2.4s,      #16
97    orr     v16.16b,        v0.16b,     v16.16b
98    // v16 now contains 8 sums
99
100    // Load third row of top
101    ld1     {v24.8b},       [x0],       x2
102
103    // Load third row of bottom
104    ld1     {v25.8b},       [x1],       x3
105    mov     v24.d[1],       v25.d[0]
106
107    // Load fourth row of top
108    ld1     {v26.8b},       [x0],       x2
109
110    // Load fourth row of bottom
111    ld1     {v27.8b},       [x1],       x3
112    mov     v26.d[1],       v27.d[0]
113
114    // Get row sums
115    uaddlp  v4.8h,          v24.16b
116
117    uaddlp  v6.8h,          v26.16b
118
119    uaddlp  v4.4s,          v4.8h
120
121    uaddlp  v6.4s,          v6.8h
122    // Both v4 and v6 have four 32 bit sums corresponding to last 4 rows
123    // Pack v4 and v6 into a single register (sum does not exceed 16bits)
124
125    shl     v18.4s,         v6.4s,      #16
126    orr     v18.16b,        v4.16b,     v18.16b
127    // v18 now contains 8 sums
128
129    // Compute absolute diff between top and bottom row sums
130    mov     v17.d[0],       v16.d[1]
131    uabd    v16.4h,         v16.4h,     v17.4h
132
133    mov     v19.d[0],       v18.d[1]
134    uabd    v17.4h,         v18.4h,     v19.4h
135
136    mov     v16.d[1],       v17.d[0]
137
138    // RSUM_CSUM_THRESH
139    movi    v18.8h,         #20
140
141    // Eliminate values smaller than RSUM_CSUM_THRESH
142    cmhs    v20.8h,         v16.8h,     v18.8h
143    and     v20.16b,        v16.16b,    v20.16b
144
145    // v20 now contains 8 absolute diff of sums above the threshold
146
147    // Compute adj
148    mov     v21.d[0],       v20.d[1]
149    add     v20.4h,         v20.4h,     v21.4h
150
151    // v20 has four adj values for two sub-blocks
152
153    // Compute alt
154    uabd    v0.4s,      v0.4s,      v2.4s
155    uabd    v4.4s,      v4.4s,      v6.4s
156
157    add     v0.4s,      v0.4s,      v4.4s
158
159    mov     v1.d[0],    v0.d[1]
160    add     v21.4s,     v0.4s,      v1.4s
161    // d21 has two values for two sub-blocks
162
163
164    // Calculate column based adj and alt values
165
166    urhadd  v0.16b,     v28.16b,    v30.16b
167    urhadd  v2.16b,     v24.16b,    v26.16b
168    urhadd  v0.16b,     v0.16b,     v2.16b
169
170    mov     v1.d[0],    v0.d[1]
171    uabd    v0.8b,      v0.8b,      v1.8b
172
173    // RSUM_CSUM_THRESH >> 2
174    movi    v22.16b,        #5
175
176    // Eliminate values smaller than RSUM_CSUM_THRESH >> 2
177    cmhs    v1.16b,      v0.16b,        v22.16b
178    and     v0.16b,      v0.16b,        v1.16b
179    // d0 now contains 8 absolute diff of sums above the threshold
180
181
182    uaddlp  v0.4h,      v0.8b
183    shl     v0.4h,      v0.4h,#2
184
185    // Add row based adj
186    add     v20.4h,     v0.4h,      v20.4h
187
188    uaddlp  v20.2s,     v20.4h
189    // d20 now contains 2 adj values
190
191
192    urhadd  v0.8b,      v28.8b,     v29.8b
193    urhadd  v2.8b,      v24.8b,     v25.8b
194    urhadd  v0.8b,      v0.8b,      v2.8b
195
196    urhadd  v1.8b,      v30.8b,     v31.8b
197    urhadd  v3.8b,      v26.8b,     v27.8b
198    urhadd  v1.8b,      v1.8b,      v3.8b
199
200    uabd    v0.8b,      v0.8b,      v1.8b
201    uaddlp  v0.4h,      v0.8b
202
203    shl     v0.4h,      v0.4h,      #2
204    uaddlp  v0.2s,      v0.4h
205    add     v21.2s,     v0.2s,      v21.2s
206
207
208    // d21 now contains 2 alt values
209
210    // SAD_BIAS_MULT_SHIFT
211    ushr    v0.2s,      v21.2s,     #3
212    add     v21.2s,     v21.2s,     v0.2s
213
214    // SAD_BIAS_ADDITIVE >> 1
215    movi    v0.2s,      #4
216    add     v21.2s,     v21.2s,     v0.2s
217
218    cmhi    v0.2s,      v20.2s,     v21.2s
219    uaddlp  v0.1d,      v0.2s
220
221    smov    x0,         v0.s[0]
222    cmp     x0,         #0
223    mov     x4,         #1
224    csel    x0,         x4,         x0,         ne
225    ret
226