;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; SAD_FN width, height, n_gprs, avg
; Shared prologue for the sad<width>x<height>[_avg] kernels below: declares the
; entry point, sign-extends the strides and, when n_gprs is 7, precomputes
; stride*3 in src_stride3q/ref_stride3q. avg != 0 selects the second_pred
; averaging variant.
%macro SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea          src_stride3q, [src_strideq*3]
  lea          ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
  mov              n_rowsd, %1
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]
  paddd                 m1, m2
  paddd                 m3, m4
  add                 refq, ref_strideq
  paddd                 m0, m1
  add                 srcq, src_strideq
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2

; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
  mov              n_rowsd, %1/2
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq]
  movu                  m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+src_strideq]
  psadbw                m4, [srcq+src_strideq+16]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD32XN 64 ; sad32x64_sse2
SAD32XN 32 ; sad32x32_sse2
SAD32XN 16 ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2

; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+src_strideq]
  psadbw                m3, [srcq+src_strideq*2]
  psadbw                m4, [srcq+src_stride3q]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD16XN 32 ; sad16x32_sse2
SAD16XN 16 ; sad16x16_sse2
SAD16XN 8 ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN 8, 1 ; sad16x8_avg_sse2

; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movh                  m1, [refq]
  movhps                m1, [refq+ref_strideq]
  movh                  m2, [refq+ref_strideq*2]
  movhps                m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movh                  m3, [srcq]
  movhps                m3, [srcq+src_strideq]
  movh                  m4, [srcq+src_strideq*2]
  movhps                m4, [srcq+src_stride3q]
  psadbw                m1, m3
  psadbw                m2, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m2
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD8XN 16 ; sad8x16_sse2
SAD8XN 8 ; sad8x8_sse2
SAD8XN 4 ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN 8, 1 ; sad8x8_avg_sse2
SAD8XN 4, 1 ; sad8x4_avg_sse2

; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
;                                  uint8_t *ref, int ref_stride);
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movd                  m1, [refq]
  movd                  m2, [refq+ref_strideq]
  movd                  m3, [refq+ref_strideq*2]
  movd                  m4, [refq+ref_stride3q]
  punpckldq             m1, m2
  punpckldq             m3, m4
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m3, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movd                  m2, [srcq]
  movd                  m5, [srcq+src_strideq]
  movd                  m4, [srcq+src_strideq*2]
  movd                  m6, [srcq+src_stride3q]
  punpckldq             m2, m5
  punpckldq             m4, m6
  psadbw                m1, m2
  psadbw                m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movd                 eax, m0
  RET
%endmacro

INIT_MMX sse
SAD4XN 8 ; sad4x8_sse
SAD4XN 4 ; sad4x4_sse
SAD4XN 8, 1 ; sad4x8_avg_sse
SAD4XN 4, 1 ; sad4x4_avg_sse
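
; For reference, a hypothetical scalar C sketch of what the kernels above
; compute. It is not part of this file or of the library's API; the function
; name and the width/height/second_pred parameters are illustrative only.
; The _avg variants first average ref with second_pred, rounding up exactly
; as pavgb does, before accumulating absolute differences as psadbw does.
;
;   static unsigned int sad_ref_c(const uint8_t *src, int src_stride,
;                                 const uint8_t *ref, int ref_stride,
;                                 const uint8_t *second_pred, /* NULL for plain SAD */
;                                 int width, int height) {
;     unsigned int sad = 0;
;     for (int y = 0; y < height; ++y) {
;       for (int x = 0; x < width; ++x) {
;         int r = ref[x];
;         if (second_pred) r = (r + second_pred[x] + 1) >> 1;  /* pavgb */
;         sad += src[x] > r ? src[x] - r : r - src[x];         /* psadbw */
;       }
;       src += src_stride;
;       ref += ref_stride;
;       /* second_pred is a packed width x height block (stride == width),
;          matching the mmsize-based advances in the loops above. */
;       if (second_pred) second_pred += width;
;     }
;     return sad;
;   }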