//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

//******************************************************************************
//*
//* @brief
//*  This file contains definitions of routines for the combing artifact
//*  check (CAC)
//*
//* @author
//*  Ittiam
//*
//* @par List of Functions:
//*  - ideint_cac_8x8_av8()
//*
//* @remarks
//*  Target: AArch64 (ARMv8-A) with Advanced SIMD, GNU assembler syntax.
//*
//*******************************************************************************


//******************************************************************************
//*
//*  @brief Calculates Combing Artifact
//*
//*  @par   Description
//*   This function calculates the combing artifact check (CAC) for the given
//*   two fields. Four rows of eight bytes are read from each field. The
//*   eight columns are treated as two sub-blocks (columns 0-3 and 4-7); for
//*   each sub-block an "adj" measure (top field vs bottom field differences)
//*   and an "alt" measure (differences between consecutive rows of the same
//*   field) are accumulated and compared.
//*
//*  @param[in] pu1_top   (x0)
//*   UWORD8 pointer to top field
//*
//*  @param[in] pu1_bot   (x1)
//*   UWORD8 pointer to bottom field
//*
//*  @param[in] top_strd  (x2)
//*   Top field stride, in bytes (added to pu1_top after every row load)
//*
//*  @param[in] bot_strd  (x3)
//*   Bottom field stride, in bytes (added to pu1_bot after every row load)
//*
//*  @returns
//*   x0 = 1 if adj > alt + (alt >> 3) + 4 for at least one of the two
//*   sub-blocks, 0 otherwise
//*
//*  @remarks
//*   Leaf routine; does not touch the stack.
//*   Modifies: x0-x3, v0-v4, v6, v16-v22, v24-v31 and the condition flags.
//*   None of the callee-saved registers (x19-x28, v8-v15) are used.
//*   NOTE(review): the C prototype is not visible in this file. The strides
//*   are presumed to be declared as 32-bit signed integers - confirm against
//*   the header that declares ideint_cac_8x8_av8.
//*
//******************************************************************************

    .global ideint_cac_8x8_av8

ideint_cac_8x8_av8:

    // The strides are used below as 64-bit post-index offsets. AAPCS64 leaves
    // the upper 32 bits of a register carrying a 32-bit argument unspecified,
    // so sign-extend before use.
    // NOTE(review): assumes 32-bit signed stride arguments - see @remarks.
    sxtw    x2, w2
    sxtw    x3, w3

    // Load first row of top
    ld1     {v28.8b}, [x0], x2

    // Load first row of bottom
    ld1     {v29.8b}, [x1], x3
    // v28 = { top row 0 (8 bytes) | bottom row 0 (8 bytes) }
    mov     v28.d[1], v29.d[0]

    // Load second row of top
    ld1     {v30.8b}, [x0], x2

    // Load second row of bottom
    ld1     {v31.8b}, [x1], x3
    // v30 = { top row 1 | bottom row 1 }
    mov     v30.d[1], v31.d[0]


    // Calculate row based adj and alt values
    // Get row sums: two pairwise widening adds reduce every group of four
    // adjacent bytes to one 32-bit sum
    uaddlp  v0.8h, v28.16b

    uaddlp  v2.8h, v30.16b

    uaddlp  v0.4s, v0.8h

    uaddlp  v2.4s, v2.8h

    // v0 (row 0) and v2 (row 1) each hold four 32-bit sums:
    //   { top cols 0-3, top cols 4-7, bottom cols 0-3, bottom cols 4-7 }
    // Pack v0 and v2 into a single register (a sum of four bytes is at most
    // 4 * 255 and so does not exceed 16 bits): row 0 sums go to the even
    // 16-bit lanes, row 1 sums to the odd 16-bit lanes

    shl     v16.4s, v2.4s, #16
    orr     v16.16b, v0.16b, v16.16b
    // v16 now contains 8 sums (rows 0 and 1, top in d[0], bottom in d[1])

    // Load third row of top
    ld1     {v24.8b}, [x0], x2

    // Load third row of bottom
    ld1     {v25.8b}, [x1], x3
    // v24 = { top row 2 | bottom row 2 }
    mov     v24.d[1], v25.d[0]

    // Load fourth row of top
    ld1     {v26.8b}, [x0], x2

    // Load fourth row of bottom
    ld1     {v27.8b}, [x1], x3
    // v26 = { top row 3 | bottom row 3 }
    mov     v26.d[1], v27.d[0]

    // Get row sums
    uaddlp  v4.8h, v24.16b

    uaddlp  v6.8h, v26.16b

    uaddlp  v4.4s, v4.8h

    uaddlp  v6.4s, v6.8h
    // v4 (row 2) and v6 (row 3) each hold four 32-bit sums, laid out as
    // v0/v2 above
    // Pack v4 and v6 into a single register (sum does not exceed 16 bits)

    shl     v18.4s, v6.4s, #16
    orr     v18.16b, v4.16b, v18.16b
    // v18 now contains 8 sums (rows 2 and 3, top in d[0], bottom in d[1])

    // Compute absolute diff between top and bottom row sums
    mov     v17.d[0], v16.d[1]
    uabd    v16.4h, v16.4h, v17.4h

    mov     v19.d[0], v18.d[1]
    uabd    v17.4h, v18.4h, v19.4h

    // v16.8h = |top - bottom| sums for rows 0,1 (low half) and 2,3 (high half)
    mov     v16.d[1], v17.d[0]

    // RSUM_CSUM_THRESH
    movi    v18.8h, #20

    // Eliminate values smaller than RSUM_CSUM_THRESH
    cmhs    v20.8h, v16.8h, v18.8h
    and     v20.16b, v16.16b, v20.16b

    // v20 now contains 8 absolute diff of sums above the threshold

    // Compute adj
    mov     v21.d[0], v20.d[1]
    add     v20.4h, v20.4h, v21.4h

    // v20 has four adj values for two sub-blocks
    // (16-bit lanes 0,1 -> columns 0-3, lanes 2,3 -> columns 4-7)

    // Compute alt: |row 0 - row 1| + |row 2 - row 3| of the same-field sums
    uabd    v0.4s, v0.4s, v2.4s
    uabd    v4.4s, v4.4s, v6.4s

    add     v0.4s, v0.4s, v4.4s

    // Add the top field part (low half) to the bottom field part (high half).
    // Only the two low lanes are meaningful, so the add is done on .2s and
    // never reads the unwritten upper half of v1.
    mov     v1.d[0], v0.d[1]
    add     v21.2s, v0.2s, v1.2s
    // d21 has two values for two sub-blocks


    // Calculate column based adj and alt values

    // Rounded average of the four rows of each field, per column
    urhadd  v0.16b, v28.16b, v30.16b
    urhadd  v2.16b, v24.16b, v26.16b
    urhadd  v0.16b, v0.16b, v2.16b

    // |top column average - bottom column average|
    mov     v1.d[0], v0.d[1]
    uabd    v0.8b, v0.8b, v1.8b

    // RSUM_CSUM_THRESH >> 2
    movi    v22.16b, #5

    // Eliminate values smaller than RSUM_CSUM_THRESH >> 2
    cmhs    v1.16b, v0.16b, v22.16b
    and     v0.16b, v0.16b, v1.16b
    // d0 now contains 8 absolute diff of sums above the threshold


    // Pairwise add the columns and multiply by 4 (the values are averages
    // over four rows)
    uaddlp  v0.4h, v0.8b
    shl     v0.4h, v0.4h, #2

    // Add row based adj
    add     v20.4h, v0.4h, v20.4h

    uaddlp  v20.2s, v20.4h
    // d20 now contains 2 adj values


    // Column based alt: per-column rounded average of rows 0 and 2 of both
    // fields (v0) against the same average of rows 1 and 3 (v1)
    urhadd  v0.8b, v28.8b, v29.8b
    urhadd  v2.8b, v24.8b, v25.8b
    urhadd  v0.8b, v0.8b, v2.8b

    urhadd  v1.8b, v30.8b, v31.8b
    urhadd  v3.8b, v26.8b, v27.8b
    urhadd  v1.8b, v1.8b, v3.8b

    uabd    v0.8b, v0.8b, v1.8b
    uaddlp  v0.4h, v0.8b

    shl     v0.4h, v0.4h, #2
    uaddlp  v0.2s, v0.4h
    add     v21.2s, v0.2s, v21.2s


    // d21 now contains 2 alt values

    // SAD_BIAS_MULT_SHIFT: alt += alt >> 3
    ushr    v0.2s, v21.2s, #3
    add     v21.2s, v21.2s, v0.2s

    // SAD_BIAS_ADDITIVE >> 1
    movi    v0.2s, #4
    add     v21.2s, v21.2s, v0.2s

    // Each 32-bit lane of d0 becomes all-ones where adj > alt, else zero
    cmhi    v0.2s, v20.2s, v21.2s

    // Return 1 if either sub-block flagged, 0 otherwise
    mov     x0, v0.d[0]
    cmp     x0, #0
    cset    w0, ne
    ret