;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

; TABULATE_SSIM
;   Folds one batch of eight 16-bit samples into the running totals.
;   In:       xmm3 = source samples (words), xmm4 = reference samples (words)
;   Updates:  xmm15 += s        (per-word, saturating add)    sum_s
;             xmm14 += r        (per-word, saturating add)    sum_r
;             xmm13 += s*s      (per-dword, pairs pre-added)  sum_sq_s
;             xmm12 += r*r      (per-dword, pairs pre-added)  sum_sq_r
;             xmm11 += s*r      (per-dword, pairs pre-added)  sum_sxr
;   Clobbers: xmm1, xmm2, xmm3
%macro TABULATE_SSIM 0
        ; source side: running sum and sum of squares
        movdqa          xmm1,       xmm3
        paddusw         xmm15,      xmm3            ; sum_s
        pmaddwd         xmm1,       xmm1            ; s*s, adjacent pairs added
        paddd           xmm13,      xmm1            ; sum_sq_s

        ; reference side: running sum and sum of squares
        movdqa          xmm2,       xmm4
        paddusw         xmm14,      xmm4            ; sum_r
        pmaddwd         xmm2,       xmm2            ; r*r, adjacent pairs added
        paddd           xmm12,      xmm2            ; sum_sq_r

        ; cross term (destroys xmm3)
        pmaddwd         xmm3,       xmm4            ; s*r, adjacent pairs added
        paddd           xmm11,      xmm3            ; sum_sxr
%endmacro

; SUM_ACROSS_Q reg
;   Horizontal add of the four dwords held in %1.  The dwords are widened to
;   qwords by interleaving with xmm0, so xmm0 must be zero (both routines in
;   this file clear it before using the macro).
;   Out:      low qword of %1 = total, high qword of %1 = 0
;   Clobbers: xmm2
%macro SUM_ACROSS_Q 1
        ; 4 x dword -> 2 x qword
        movdqa          xmm2,       %1
        punpckhdq       xmm2,       xmm0            ; dwords 2,3 widened
        punpckldq       %1,         xmm0            ; dwords 0,1 widened
        paddq           %1,         xmm2

        ; 2 x qword -> 1 x qword
        movdqa          xmm2,       %1
        punpckhqdq      xmm2,       xmm0            ; high qword moved down
        punpcklqdq      %1,         xmm0            ; low qword kept, high cleared
        paddq           %1,         xmm2
%endmacro

; SUM_ACROSS_W reg
;   Horizontal add of the eight words held in %1.  The words are widened to
;   dwords by interleaving with xmm0 (which must be zero), added pairwise, and
;   the remaining four dwords are reduced with SUM_ACROSS_Q.
;   Out:      low qword of %1 = total, high qword of %1 = 0
;   Clobbers: xmm1, xmm2
%macro SUM_ACROSS_W 1
        movdqa          xmm1,       %1
        punpckhwd       xmm1,       xmm0            ; words 4..7 widened
        punpcklwd       %1,         xmm0            ; words 0..3 widened
        paddd           %1,         xmm1
        SUM_ACROSS_Q    %1
%endmacro

;void vp9_ssim_parms_16x16_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; TODO: Use parameter passing through a structure. The pxors are probably not
; needed (the calling app will initialize to 0). Everything could easily fit
; in sse2 without too much hassle, and better estimates can probably be had
; with psadbw or pavgb. At this point this is just meant to be a first pass
; for calculating all the parms needed for 16x16 ssim so we can play with
; dssim as distortion in mode selection code.
; Accumulates the five SSIM sums over a block of 16 rows x 16 pixels.
;
; Register use inside the routine:
;   rsi / rcx   source row pointer / source stride (sp, sign-extended int)
;   rdi / rax   reference row pointer / reference stride (rp, sign-extended int)
;   rdx         rows remaining
;   xmm0        constant zero, used to widen bytes/words/dwords
;   xmm15,xmm14 word-wise running sums of s and r.  Each word lane receives
;               2 values <= 255 per row, 32 in total, so the saturating add
;               cannot actually saturate here.
;   xmm13..11   dword-wise running sums of s*s, r*r and s*r
;
; Each result is written with movd, i.e. exactly 32 bits are stored through
; each output pointer.
; NOTE(review): where unsigned long is 64 bits wide the upper half of every
; output is left untouched - presumably callers zero-initialize; confirm.
global sym(vp9_ssim_parms_16x16_sse2) PRIVATE
sym(vp9_ssim_parms_16x16_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)              ;s
    ; sp and rp are C ints: only the low 32 bits of their argument slots are
    ; defined, so sign-extend them instead of loading all 64 bits.
    movsxd      rcx,        dword arg(1)        ;sp
    mov         rdi,        arg(2)              ;r
    movsxd      rax,        dword arg(3)        ;rp

    pxor        xmm0,       xmm0                ;zero for unpacking
    pxor        xmm15,      xmm15               ;sum_s
    pxor        xmm14,      xmm14               ;sum_r
    pxor        xmm13,      xmm13               ;sum_sq_s
    pxor        xmm12,      xmm12               ;sum_sq_r
    pxor        xmm11,      xmm11               ;sum_sxr

    mov         rdx,        16                  ;row counter
.NextRow:

    ;grab source and reference pixels (16 bytes each, no alignment assumed)
    movdqu      xmm5,       [rsi]
    movdqu      xmm6,       [rdi]

    ; upper 8 pixels of the row, widened to words
    movdqa      xmm3,       xmm5
    movdqa      xmm4,       xmm6
    punpckhbw   xmm3,       xmm0                ; high_s
    punpckhbw   xmm4,       xmm0                ; high_r

    TABULATE_SSIM

    ; lower 8 pixels of the row, widened to words
    movdqa      xmm3,       xmm5
    movdqa      xmm4,       xmm6
    punpcklbw   xmm3,       xmm0                ; low_s
    punpcklbw   xmm4,       xmm0                ; low_r

    TABULATE_SSIM

    add         rsi,        rcx                 ; next s row
    add         rdi,        rax                 ; next r row

    dec         rdx                             ; counter
    jnz         .NextRow

    ; collapse every accumulator to a single total in its low lane
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ; store the low 32 bits of each total through the output pointers
    mov         rdi,        arg(4)
    movd        [rdi],      xmm15               ; *sum_s
    mov         rdi,        arg(5)
    movd        [rdi],      xmm14               ; *sum_r
    mov         rdi,        arg(6)
    movd        [rdi],      xmm13               ; *sum_sq_s
    mov         rdi,        arg(7)
    movd        [rdi],      xmm12               ; *sum_sq_r
    mov         rdi,        arg(8)
    movd        [rdi],      xmm11               ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void ssim_parms_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 8x8 ssim so we can play
; with dssim as distortion in mode selection code.
;
; Accumulates the five SSIM sums over a block of 8 rows x 8 pixels.
;
; Register use inside the routine:
;   rsi / rcx   source row pointer / source stride (sp, sign-extended int)
;   rdi / rax   reference row pointer / reference stride (rp, sign-extended int)
;   rdx         rows remaining
;   xmm0        constant zero, used to widen bytes/words/dwords
;   xmm15,xmm14 word-wise running sums of s and r.  Each word lane receives
;               1 value <= 255 per row, 8 in total, so the saturating add
;               cannot actually saturate here.
;   xmm13..11   dword-wise running sums of s*s, r*r and s*r
;
; Each result is written with movd, i.e. exactly 32 bits are stored through
; each output pointer.
; NOTE(review): where unsigned long is 64 bits wide the upper half of every
; output is left untouched - presumably callers zero-initialize; confirm.
global sym(vp9_ssim_parms_8x8_sse2) PRIVATE
sym(vp9_ssim_parms_8x8_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)              ;s
    ; sp and rp are C ints: only the low 32 bits of their argument slots are
    ; defined, so sign-extend them instead of loading all 64 bits.
    movsxd      rcx,        dword arg(1)        ;sp
    mov         rdi,        arg(2)              ;r
    movsxd      rax,        dword arg(3)        ;rp

    pxor        xmm0,       xmm0                ;zero for unpacking
    pxor        xmm15,      xmm15               ;sum_s
    pxor        xmm14,      xmm14               ;sum_r
    pxor        xmm13,      xmm13               ;sum_sq_s
    pxor        xmm12,      xmm12               ;sum_sq_r
    pxor        xmm11,      xmm11               ;sum_sxr

    mov         rdx,        8                   ;row counter
.NextRow:

    ;grab source and reference pixels (8 bytes each), widened to words
    movq        xmm3,       [rsi]
    movq        xmm4,       [rdi]
    punpcklbw   xmm3,       xmm0                ; low_s
    punpcklbw   xmm4,       xmm0                ; low_r

    TABULATE_SSIM

    add         rsi,        rcx                 ; next s row
    add         rdi,        rax                 ; next r row

    dec         rdx                             ; counter
    jnz         .NextRow

    ; collapse every accumulator to a single total in its low lane
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ; store the low 32 bits of each total through the output pointers
    mov         rdi,        arg(4)
    movd        [rdi],      xmm15               ; *sum_s
    mov         rdi,        arg(5)
    movd        [rdi],      xmm14               ; *sum_r
    mov         rdi,        arg(6)
    movd        [rdi],      xmm13               ; *sum_sq_s
    mov         rdi,        arg(7)
    movd        [rdi],      xmm12               ; *sum_sq_r
    mov         rdi,        arg(8)
    movd        [rdi],      xmm11               ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret